Issue #14738: Speed-up UTF-8 decoding on non-ASCII data. Patch by Serhiy Storchaka.

This commit is contained in:
Antoine Pitrou 2012-05-10 16:36:02 +02:00
parent fda08b0860
commit ca5f91b888
8 changed files with 336 additions and 572 deletions

View file

@ -10,6 +10,9 @@ What's New in Python 3.3.0 Alpha 4?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #14738: Speed-up UTF-8 decoding on non-ASCII data. Patch by Serhiy
Storchaka.
- Issue #14700: Fix two broken and undefined-behaviour-inducing overflow checks - Issue #14700: Fix two broken and undefined-behaviour-inducing overflow checks
in old-style string formatting. in old-style string formatting.

View file

@ -7,6 +7,7 @@
#define STRINGLIB(F) asciilib_##F #define STRINGLIB(F) asciilib_##F
#define STRINGLIB_OBJECT PyUnicodeObject #define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_SIZEOF_CHAR 1 #define STRINGLIB_SIZEOF_CHAR 1
#define STRINGLIB_MAX_CHAR 0x7Fu
#define STRINGLIB_CHAR Py_UCS1 #define STRINGLIB_CHAR Py_UCS1
#define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U" #define STRINGLIB_PARSE_CODE "U"

View file

@ -15,19 +15,18 @@
# error C 'long' size should be either 4 or 8! # error C 'long' size should be either 4 or 8!
#endif #endif
Py_LOCAL_INLINE(int) Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(utf8_try_decode)(const char *start, const char *end, STRINGLIB(utf8_decode)(const char **inptr, const char *end,
STRINGLIB_CHAR *dest, STRINGLIB_CHAR *dest,
const char **src_pos, Py_ssize_t *dest_index) Py_ssize_t *outpos)
{ {
int ret; Py_UCS4 ch;
Py_ssize_t n; const char *s = *inptr;
const char *s = start;
const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK); const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
STRINGLIB_CHAR *p = dest; STRINGLIB_CHAR *p = dest + *outpos;
while (s < end) { while (s < end) {
Py_UCS4 ch = (unsigned char)*s; ch = (unsigned char)*s;
if (ch < 0x80) { if (ch < 0x80) {
/* Fast path for runs of ASCII characters. Given that common UTF-8 /* Fast path for runs of ASCII characters. Given that common UTF-8
@ -48,15 +47,33 @@ STRINGLIB(utf8_try_decode)(const char *start, const char *end,
unsigned long value = *(unsigned long *) _s; unsigned long value = *(unsigned long *) _s;
if (value & ASCII_CHAR_MASK) if (value & ASCII_CHAR_MASK)
break; break;
_p[0] = _s[0]; #ifdef BYTEORDER_IS_LITTLE_ENDIAN
_p[1] = _s[1]; _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
_p[2] = _s[2]; _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
_p[3] = _s[3]; _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
#if (SIZEOF_LONG == 8) _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
_p[4] = _s[4]; # if SIZEOF_LONG == 8
_p[5] = _s[5]; _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
_p[6] = _s[6]; _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
_p[7] = _s[7]; _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
_p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
# endif
#else
# if SIZEOF_LONG == 8
_p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
_p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
_p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
_p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
_p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
_p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
_p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
_p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
# else
_p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
_p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
_p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
_p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
# endif
#endif #endif
_s += SIZEOF_LONG; _s += SIZEOF_LONG;
_p += SIZEOF_LONG; _p += SIZEOF_LONG;
@ -67,87 +84,135 @@ STRINGLIB(utf8_try_decode)(const char *start, const char *end,
break; break;
ch = (unsigned char)*s; ch = (unsigned char)*s;
} }
if (ch < 0x80) {
s++;
*p++ = ch;
continue;
}
} }
if (ch < 0x80) { if (ch < 0xC2) {
s++; /* invalid sequence
\x80-\xBF -- continuation byte
\xC0-\xC1 -- fake 0000-007F */
goto InvalidStart;
}
if (ch < 0xE0) {
/* \xC2\x80-\xDF\xBF -- 0080-07FF */
Py_UCS4 ch2;
if (end - s < 2) {
/* unexpected end of data: the caller will decide whether
it's an error or not */
break;
}
ch2 = (unsigned char)s[1];
if ((ch2 & 0xC0) != 0x80)
/* invalid continuation byte */
goto InvalidContinuation;
ch = (ch << 6) + ch2 -
((0xC0 << 6) + 0x80);
assert ((ch > 0x007F) && (ch <= 0x07FF));
s += 2;
if (STRINGLIB_MAX_CHAR <= 0x007F ||
(STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
goto Overflow;
*p++ = ch; *p++ = ch;
continue; continue;
} }
n = utf8_code_length[ch]; if (ch < 0xF0) {
/* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
if (s + n > end) { Py_UCS4 ch2, ch3;
/* unexpected end of data: the caller will decide whether if (end - s < 3) {
it's an error or not */ /* unexpected end of data: the caller will decide whether
goto _error; it's an error or not */
} break;
switch (n) {
case 0:
/* invalid start byte */
goto _error;
case 1:
/* internal error */
goto _error;
case 2:
if ((s[1] & 0xc0) != 0x80)
/* invalid continuation byte */
goto _error;
ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
assert ((ch > 0x007F) && (ch <= 0x07FF));
s += 2;
*p++ = ch;
break;
case 3:
/* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
will result in surrogates in range d800-dfff. Surrogates are
not valid UTF-8 so they are rejected.
See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
((unsigned char)s[0] == 0xE0 &&
(unsigned char)s[1] < 0xA0) ||
((unsigned char)s[0] == 0xED &&
(unsigned char)s[1] > 0x9F)) {
/* invalid continuation byte */
goto _error;
} }
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); ch2 = (unsigned char)s[1];
ch3 = (unsigned char)s[2];
if ((ch2 & 0xC0) != 0x80 ||
(ch3 & 0xC0) != 0x80) {
/* invalid continuation byte */
goto InvalidContinuation;
}
if (ch == 0xE0) {
if (ch2 < 0xA0)
/* invalid sequence
\xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-07FF */
goto InvalidContinuation;
}
else if (ch == 0xED && ch2 > 0x9F) {
/* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
will result in surrogates in range D800-DFFF. Surrogates are
not valid UTF-8 so they are rejected.
See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
goto InvalidContinuation;
}
ch = (ch << 12) + (ch2 << 6) + ch3 -
((0xE0 << 12) + (0x80 << 6) + 0x80);
assert ((ch > 0x07FF) && (ch <= 0xFFFF)); assert ((ch > 0x07FF) && (ch <= 0xFFFF));
s += 3; s += 3;
if (STRINGLIB_MAX_CHAR <= 0x07FF ||
(STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
goto Overflow;
*p++ = ch; *p++ = ch;
break; continue;
case 4:
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
(s[3] & 0xc0) != 0x80 ||
((unsigned char)s[0] == 0xF0 &&
(unsigned char)s[1] < 0x90) ||
((unsigned char)s[0] == 0xF4 &&
(unsigned char)s[1] > 0x8F)) {
/* invalid continuation byte */
goto _error;
}
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
s += 4;
*p++ = ch;
break;
} }
if (ch < 0xF5) {
/* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
Py_UCS4 ch2, ch3, ch4;
if (end - s < 4) {
/* unexpected end of data: the caller will decide whether
it's an error or not */
break;
}
ch2 = (unsigned char)s[1];
ch3 = (unsigned char)s[2];
ch4 = (unsigned char)s[3];
if ((ch2 & 0xC0) != 0x80 ||
(ch3 & 0xC0) != 0x80 ||
(ch4 & 0xC0) != 0x80) {
/* invalid continuation byte */
goto InvalidContinuation;
}
if (ch == 0xF0) {
if (ch2 < 0x90)
/* invalid sequence
\xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
goto InvalidContinuation;
}
else if (ch == 0xF4 && ch2 > 0x8F) {
/* invalid sequence
\xF4\x90\x80\x80- -- 110000- overflow */
goto InvalidContinuation;
}
ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
s += 4;
if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
(STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
goto Overflow;
*p++ = ch;
continue;
}
goto InvalidStart;
} }
ret = 0; ch = 0;
goto _ok; Overflow:
_error: Return:
ret = -1; *inptr = s;
_ok: *outpos = p - dest;
*src_pos = s; return ch;
*dest_index = p - dest; InvalidStart:
return ret; ch = 1;
goto Return;
InvalidContinuation:
ch = 2;
goto Return;
} }
#undef LONG_PTR_MASK #undef LONG_PTR_MASK

View file

@ -7,6 +7,7 @@
#define STRINGLIB(F) ucs1lib_##F #define STRINGLIB(F) ucs1lib_##F
#define STRINGLIB_OBJECT PyUnicodeObject #define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_SIZEOF_CHAR 1 #define STRINGLIB_SIZEOF_CHAR 1
#define STRINGLIB_MAX_CHAR 0xFFu
#define STRINGLIB_CHAR Py_UCS1 #define STRINGLIB_CHAR Py_UCS1
#define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U" #define STRINGLIB_PARSE_CODE "U"

View file

@ -7,6 +7,7 @@
#define STRINGLIB(F) ucs2lib_##F #define STRINGLIB(F) ucs2lib_##F
#define STRINGLIB_OBJECT PyUnicodeObject #define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_SIZEOF_CHAR 2 #define STRINGLIB_SIZEOF_CHAR 2
#define STRINGLIB_MAX_CHAR 0xFFFFu
#define STRINGLIB_CHAR Py_UCS2 #define STRINGLIB_CHAR Py_UCS2
#define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U" #define STRINGLIB_PARSE_CODE "U"

View file

@ -7,6 +7,7 @@
#define STRINGLIB(F) ucs4lib_##F #define STRINGLIB(F) ucs4lib_##F
#define STRINGLIB_OBJECT PyUnicodeObject #define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_SIZEOF_CHAR 4 #define STRINGLIB_SIZEOF_CHAR 4
#define STRINGLIB_MAX_CHAR 0x10FFFFu
#define STRINGLIB_CHAR Py_UCS4 #define STRINGLIB_CHAR Py_UCS4
#define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U" #define STRINGLIB_PARSE_CODE "U"

View file

@ -1,6 +1,7 @@
#undef FASTSEARCH #undef FASTSEARCH
#undef STRINGLIB #undef STRINGLIB
#undef STRINGLIB_SIZEOF_CHAR #undef STRINGLIB_SIZEOF_CHAR
#undef STRINGLIB_MAX_CHAR
#undef STRINGLIB_CHAR #undef STRINGLIB_CHAR
#undef STRINGLIB_STR #undef STRINGLIB_STR
#undef STRINGLIB_LEN #undef STRINGLIB_LEN

View file

@ -4615,28 +4615,6 @@ PyUnicode_EncodeUTF7(const Py_UNICODE *s,
/* --- UTF-8 Codec -------------------------------------------------------- */ /* --- UTF-8 Codec -------------------------------------------------------- */
static
char utf8_code_length[256] = {
/* Map UTF-8 encoded prefix byte to sequence length. Zero means
illegal prefix. See RFC 3629 for details */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */
};
PyObject * PyObject *
PyUnicode_DecodeUTF8(const char *s, PyUnicode_DecodeUTF8(const char *s,
Py_ssize_t size, Py_ssize_t size,
@ -4645,6 +4623,10 @@ PyUnicode_DecodeUTF8(const char *s,
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
} }
#include "stringlib/asciilib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"
#include "stringlib/ucs1lib.h" #include "stringlib/ucs1lib.h"
#include "stringlib/codecs.h" #include "stringlib/codecs.h"
#include "stringlib/undef.h" #include "stringlib/undef.h"
@ -4670,310 +4652,60 @@ PyUnicode_DecodeUTF8(const char *s,
# error C 'long' size should be either 4 or 8! # error C 'long' size should be either 4 or 8!
#endif #endif
/* Scans a UTF-8 string and returns the maximum character to be expected static Py_ssize_t
and the size of the decoded unicode string. ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
This function doesn't check for errors, these checks are performed in
PyUnicode_DecodeUTF8Stateful.
*/
static Py_UCS4
utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
{ {
Py_ssize_t char_count = 0; const char *p = start;
const unsigned char *end = p + string_size; const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
assert(unicode_size != NULL); #if SIZEOF_LONG <= SIZEOF_VOID_P
assert(!((size_t) dest & LONG_PTR_MASK));
/* By having a cascade of independent loops which fallback onto each if (!((size_t) p & LONG_PTR_MASK)) {
other, we minimize the amount of work done in the average loop /* Fast path, see in STRINGLIB(utf8_decode) for
iteration, and we also maximize the CPU's ability to predict an explanation. */
branches correctly (because a given condition will have always the /* Help register allocation */
same boolean outcome except perhaps in the last iteration of the register const char *_p = p;
corresponding loop). register Py_UCS1 * q = dest;
In the general case this brings us rather close to decoding while (_p < aligned_end) {
performance pre-PEP 393, despite the two-pass decoding. unsigned long value = *(const unsigned long *) _p;
if (value & ASCII_CHAR_MASK)
Note that the pure ASCII loop is not duplicated once a non-ASCII
character has been encountered. It is actually a pessimization (by
a significant factor) to use this loop on text with many non-ASCII
characters, and it is important to avoid bad performance on valid
utf-8 data (invalid utf-8 being a different can of worms).
*/
/* ASCII */
for (; p < end; ++p) {
/* Only check value if it's not a ASCII char... */
if (*p < 0x80) {
/* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
an explanation. */
if (!((size_t) p & LONG_PTR_MASK)) {
/* Help register allocation */
register const unsigned char *_p = p;
while (_p < aligned_end) {
unsigned long value = *(unsigned long *) _p;
if (value & ASCII_CHAR_MASK)
break;
_p += SIZEOF_LONG;
char_count += SIZEOF_LONG;
}
p = _p;
if (p == end)
break;
}
}
if (*p < 0x80)
++char_count;
else
goto _ucs1loop;
}
*unicode_size = char_count;
return 127;
_ucs1loop:
for (; p < end; ++p) {
if (*p < 0xc4)
char_count += ((*p & 0xc0) != 0x80);
else
goto _ucs2loop;
}
*unicode_size = char_count;
return 255;
_ucs2loop:
for (; p < end; ++p) {
if (*p < 0xf0)
char_count += ((*p & 0xc0) != 0x80);
else
goto _ucs4loop;
}
*unicode_size = char_count;
return 65535;
_ucs4loop:
for (; p < end; ++p) {
char_count += ((*p & 0xc0) != 0x80);
}
*unicode_size = char_count;
return 65537;
}
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
in case of errors. Implicit parameters: unicode, kind, data, onError.
Potential resizing overallocates, so the result needs to shrink at the end.
*/
#define WRITE_MAYBE_FAIL(index, value) \
do { \
Py_ssize_t pos = index; \
if (pos > PyUnicode_GET_LENGTH(unicode) && \
unicode_resize(&unicode, pos + pos/8) < 0) \
goto onError; \
if (unicode_putchar(&unicode, &pos, value) < 0) \
goto onError; \
} while (0)
static PyObject *
decode_utf8_errors(const char *starts,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed,
const char *s,
PyObject *unicode,
Py_ssize_t i)
{
int n;
int k;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
const char *e = starts + size;
const char *aligned_end;
const char *errmsg = "";
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
while (s < e) {
Py_UCS4 ch = (unsigned char)*s;
if (ch < 0x80) {
/* Fast path for runs of ASCII characters. Given that common UTF-8
input will consist of an overwhelming majority of ASCII
characters, we try to optimize for this case by checking
as many characters as a C 'long' can contain.
First, check if we can do an aligned read, as most CPUs have
a penalty for unaligned reads.
*/
if (!((size_t) s & LONG_PTR_MASK)) {
/* Help register allocation */
register const char *_s = s;
register Py_ssize_t _i = i;
while (_s < aligned_end) {
/* Read a whole long at a time (either 4 or 8 bytes),
and do a fast unrolled copy if it only contains ASCII
characters. */
unsigned long value = *(unsigned long *) _s;
if (value & ASCII_CHAR_MASK)
break;
WRITE_MAYBE_FAIL(_i+0, _s[0]);
WRITE_MAYBE_FAIL(_i+1, _s[1]);
WRITE_MAYBE_FAIL(_i+2, _s[2]);
WRITE_MAYBE_FAIL(_i+3, _s[3]);
#if (SIZEOF_LONG == 8)
WRITE_MAYBE_FAIL(_i+4, _s[4]);
WRITE_MAYBE_FAIL(_i+5, _s[5]);
WRITE_MAYBE_FAIL(_i+6, _s[6]);
WRITE_MAYBE_FAIL(_i+7, _s[7]);
#endif
_s += SIZEOF_LONG;
_i += SIZEOF_LONG;
}
s = _s;
i = _i;
if (s == e)
break;
ch = (unsigned char)*s;
}
}
if (ch < 0x80) {
WRITE_MAYBE_FAIL(i++, ch);
s++;
continue;
}
n = utf8_code_length[ch];
if (s + n > e) {
if (consumed)
break; break;
else { *((unsigned long *)q) = value;
errmsg = "unexpected end of data"; _p += SIZEOF_LONG;
startinpos = s-starts; q += SIZEOF_LONG;
endinpos = startinpos+1;
for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
endinpos++;
goto utf8Error;
}
} }
p = _p;
switch (n) { while (p < end) {
if ((unsigned char)*p & 0x80)
case 0: break;
errmsg = "invalid start byte"; *q++ = *p++;
startinpos = s-starts;
endinpos = startinpos+1;
goto utf8Error;
case 1:
errmsg = "internal error";
startinpos = s-starts;
endinpos = startinpos+1;
goto utf8Error;
case 2:
if ((s[1] & 0xc0) != 0x80) {
errmsg = "invalid continuation byte";
startinpos = s-starts;
endinpos = startinpos + 1;
goto utf8Error;
}
ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
assert ((ch > 0x007F) && (ch <= 0x07FF));
WRITE_MAYBE_FAIL(i++, ch);
break;
case 3:
/* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
will result in surrogates in range d800-dfff. Surrogates are
not valid UTF-8 so they are rejected.
See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
((unsigned char)s[0] == 0xE0 &&
(unsigned char)s[1] < 0xA0) ||
((unsigned char)s[0] == 0xED &&
(unsigned char)s[1] > 0x9F)) {
errmsg = "invalid continuation byte";
startinpos = s-starts;
endinpos = startinpos + 1;
/* if s[1] first two bits are 1 and 0, then the invalid
continuation byte is s[2], so increment endinpos by 1,
if not, s[1] is invalid and endinpos doesn't need to
be incremented. */
if ((s[1] & 0xC0) == 0x80)
endinpos++;
goto utf8Error;
}
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
assert ((ch > 0x07FF) && (ch <= 0xFFFF));
WRITE_MAYBE_FAIL(i++, ch);
break;
case 4:
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
(s[3] & 0xc0) != 0x80 ||
((unsigned char)s[0] == 0xF0 &&
(unsigned char)s[1] < 0x90) ||
((unsigned char)s[0] == 0xF4 &&
(unsigned char)s[1] > 0x8F)) {
errmsg = "invalid continuation byte";
startinpos = s-starts;
endinpos = startinpos + 1;
if ((s[1] & 0xC0) == 0x80) {
endinpos++;
if ((s[2] & 0xC0) == 0x80)
endinpos++;
}
goto utf8Error;
}
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
WRITE_MAYBE_FAIL(i++, ch);
break;
} }
s += n; return p - start;
continue;
utf8Error:
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
"utf-8", errmsg,
&starts, &e, &startinpos, &endinpos, &exc, &s,
&unicode, &i))
goto onError;
/* Update data because unicode_decode_call_errorhandler might have
re-created or resized the unicode object. */
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
} }
if (consumed) #endif
*consumed = s-starts; while (p < end) {
/* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
/* Adjust length and ready string when it contained errors and for an explanation. */
is of the old resizable kind. */ if (!((size_t) p & LONG_PTR_MASK)) {
if (unicode_resize(&unicode, i) < 0) /* Help register allocation */
goto onError; register const char *_p = p;
unicode_adjust_maxchar(&unicode); while (_p < aligned_end) {
if (unicode == NULL) unsigned long value = *(unsigned long *) _p;
goto onError; if (value & ASCII_CHAR_MASK)
break;
Py_XDECREF(errorHandler); _p += SIZEOF_LONG;
Py_XDECREF(exc); }
assert(_PyUnicode_CheckConsistency(unicode, 1)); p = _p;
return unicode; if (_p == end)
break;
onError: }
Py_XDECREF(errorHandler); if ((unsigned char)*p & 0x80)
Py_XDECREF(exc); break;
Py_XDECREF(unicode); ++p;
return NULL; }
memcpy(dest, start, p - start);
return p - start;
} }
#undef WRITE_MAYBE_FAIL
PyObject * PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s, PyUnicode_DecodeUTF8Stateful(const char *s,
@ -4981,15 +4713,16 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
const char *errors, const char *errors,
Py_ssize_t *consumed) Py_ssize_t *consumed)
{ {
Py_UCS4 maxchar = 0;
Py_ssize_t unicode_size;
int has_errors = 0;
PyObject *unicode; PyObject *unicode;
int kind;
void *data;
const char *starts = s; const char *starts = s;
const char *e; const char *end = s + size;
Py_ssize_t i; Py_ssize_t outpos;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
const char *errmsg = "";
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
if (size == 0) { if (size == 0) {
if (consumed) if (consumed)
@ -4998,49 +4731,91 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
return unicode_empty; return unicode_empty;
} }
maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size); /* ASCII is equivalent to the first 128 ordinals in Unicode. */
if (size == 1 && (unsigned char)s[0] < 128) {
/* When the string is ASCII only, just use memcpy and return.
unicode_size may be != size if there is an incomplete UTF-8
sequence at the end of the ASCII block. */
if (maxchar < 128 && size == unicode_size) {
if (consumed) if (consumed)
*consumed = size; *consumed = 1;
return unicode_fromascii((const unsigned char *)s, size); return get_latin1_char((unsigned char)s[0]);
} }
unicode = PyUnicode_New(unicode_size, maxchar); unicode = PyUnicode_New(size, 127);
if (!unicode) if (!unicode)
return NULL; return NULL;
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
/* Unpack UTF-8 encoded data */ outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
i = 0; s += outpos;
e = starts + size; while (s < end) {
switch (kind) { Py_UCS4 ch;
case PyUnicode_1BYTE_KIND: int kind = PyUnicode_KIND(unicode);
has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i); if (kind == PyUnicode_1BYTE_KIND) {
break; if (PyUnicode_IS_ASCII(unicode))
case PyUnicode_2BYTE_KIND: ch = asciilib_utf8_decode(&s, end,
has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i); PyUnicode_1BYTE_DATA(unicode), &outpos);
break; else
case PyUnicode_4BYTE_KIND: ch = ucs1lib_utf8_decode(&s, end,
has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i); PyUnicode_1BYTE_DATA(unicode), &outpos);
break; } else if (kind == PyUnicode_2BYTE_KIND) {
} ch = ucs2lib_utf8_decode(&s, end,
if (!has_errors) { PyUnicode_2BYTE_DATA(unicode), &outpos);
/* Ensure the unicode size calculation was correct */ } else {
assert(i == unicode_size); assert(kind == PyUnicode_4BYTE_KIND);
assert(s == e); ch = ucs4lib_utf8_decode(&s, end,
if (consumed) PyUnicode_4BYTE_DATA(unicode), &outpos);
*consumed = size; }
return unicode;
switch (ch) {
case 0:
if (s == end || consumed)
goto End;
errmsg = "unexpected end of data";
startinpos = s - starts;
endinpos = startinpos + 1;
while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
endinpos++;
break;
case 1:
errmsg = "invalid start byte";
startinpos = s - starts;
endinpos = startinpos + 1;
break;
case 2:
errmsg = "invalid continuation byte";
startinpos = s - starts;
endinpos = startinpos + 1;
while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
endinpos++;
break;
default:
if (unicode_putchar(&unicode, &outpos, ch) < 0)
goto onError;
continue;
}
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
"utf-8", errmsg,
&starts, &end, &startinpos, &endinpos, &exc, &s,
&unicode, &outpos))
goto onError;
} }
/* In case of errors, maxchar and size computation might be incorrect; End:
code below refits and resizes as necessary. */ if (unicode_resize(&unicode, outpos) < 0)
return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i); goto onError;
if (consumed)
*consumed = s - starts;
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
assert(_PyUnicode_CheckConsistency(unicode, 1));
return unicode;
onError:
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
Py_XDECREF(unicode);
return NULL;
} }
#ifdef __APPLE__ #ifdef __APPLE__
@ -5051,9 +4826,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
wchar_t* wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{ {
int n;
const char *e; const char *e;
wchar_t *unicode, *p; wchar_t *unicode;
Py_ssize_t outpos;
/* Note: size will always be longer than the resulting Unicode /* Note: size will always be longer than the resulting Unicode
character count */ character count */
@ -5066,86 +4841,33 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
return NULL; return NULL;
/* Unpack UTF-8 encoded data */ /* Unpack UTF-8 encoded data */
p = unicode;
e = s + size; e = s + size;
outpos = 0;
while (s < e) { while (s < e) {
Py_UCS4 ch = (unsigned char)*s; Py_UCS4 ch;
if (ch < 0x80) {
*p++ = (wchar_t)ch;
s++;
continue;
}
n = utf8_code_length[ch];
if (s + n > e) {
goto surrogateescape;
}
switch (n) {
case 0:
case 1:
goto surrogateescape;
case 2:
if ((s[1] & 0xc0) != 0x80)
goto surrogateescape;
ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
assert ((ch > 0x007F) && (ch <= 0x07FF));
*p++ = (wchar_t)ch;
break;
case 3:
/* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
will result in surrogates in range d800-dfff. Surrogates are
not valid UTF-8 so they are rejected.
See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
((unsigned char)s[0] == 0xE0 &&
(unsigned char)s[1] < 0xA0) ||
((unsigned char)s[0] == 0xED &&
(unsigned char)s[1] > 0x9F)) {
goto surrogateescape;
}
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
assert ((ch > 0x07FF) && (ch <= 0xFFFF));
*p++ = (wchar_t)ch;
break;
case 4:
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
(s[3] & 0xc0) != 0x80 ||
((unsigned char)s[0] == 0xF0 &&
(unsigned char)s[1] < 0x90) ||
((unsigned char)s[0] == 0xF4 &&
(unsigned char)s[1] > 0x8F)) {
goto surrogateescape;
}
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
#if SIZEOF_WCHAR_T == 4 #if SIZEOF_WCHAR_T == 4
*p++ = (wchar_t)ch; ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else #else
/* compute and append the two surrogates: */ ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
*p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch); #endif
*p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch); if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
assert(0);
#else
assert(Py_UNICODE_IS_SURROGATE(ch));
/* compute and append the two surrogates: */
unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif #endif
break;
} }
s += n; else {
continue; if (!ch && s == e)
break;
surrogateescape: /* surrogateescape */
*p++ = 0xDC00 + ch; unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
s++; }
} }
*p = L'\0'; unicode[outpos] = L'\0';
return unicode; return unicode;
} }
@ -6970,17 +6692,13 @@ PyUnicode_DecodeASCII(const char *s,
const char *errors) const char *errors)
{ {
const char *starts = s; const char *starts = s;
PyObject *v; PyObject *unicode;
int kind; int kind;
void *data; void *data;
Py_ssize_t startinpos; Py_ssize_t startinpos;
Py_ssize_t endinpos; Py_ssize_t endinpos;
Py_ssize_t outpos; Py_ssize_t outpos;
const char *e; const char *e;
int has_error;
const unsigned char *p = (const unsigned char *)s;
const unsigned char *end = p + size;
const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
PyObject *errorHandler = NULL; PyObject *errorHandler = NULL;
PyObject *exc = NULL; PyObject *exc = NULL;
@ -6993,45 +6711,18 @@ PyUnicode_DecodeASCII(const char *s,
if (size == 1 && (unsigned char)s[0] < 128) if (size == 1 && (unsigned char)s[0] < 128)
return get_latin1_char((unsigned char)s[0]); return get_latin1_char((unsigned char)s[0]);
has_error = 0; unicode = PyUnicode_New(size, 127);
while (p < end && !has_error) { if (unicode == NULL)
/* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
an explanation. */
if (!((size_t) p & LONG_PTR_MASK)) {
/* Help register allocation */
register const unsigned char *_p = p;
while (_p < aligned_end) {
unsigned long value = *(unsigned long *) _p;
if (value & ASCII_CHAR_MASK) {
has_error = 1;
break;
}
_p += SIZEOF_LONG;
}
if (_p == end)
break;
if (has_error)
break;
p = _p;
}
if (*p & 0x80) {
has_error = 1;
break;
}
else {
++p;
}
}
if (!has_error)
return unicode_fromascii((const unsigned char *)s, size);
v = PyUnicode_New(size, 127);
if (v == NULL)
goto onError; goto onError;
kind = PyUnicode_KIND(v);
data = PyUnicode_DATA(v);
outpos = 0;
e = s + size; e = s + size;
data = PyUnicode_1BYTE_DATA(unicode);
outpos = ascii_decode(s, e, (Py_UCS1 *)data);
if (outpos == size)
return unicode;
s += outpos;
kind = PyUnicode_1BYTE_KIND;
while (s < e) { while (s < e) {
register unsigned char c = (unsigned char)*s; register unsigned char c = (unsigned char)*s;
if (c < 128) { if (c < 128) {
@ -7045,21 +6736,21 @@ PyUnicode_DecodeASCII(const char *s,
errors, &errorHandler, errors, &errorHandler,
"ascii", "ordinal not in range(128)", "ascii", "ordinal not in range(128)",
&starts, &e, &startinpos, &endinpos, &exc, &s, &starts, &e, &startinpos, &endinpos, &exc, &s,
&v, &outpos)) &unicode, &outpos))
goto onError; goto onError;
kind = PyUnicode_KIND(v); kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(v); data = PyUnicode_DATA(unicode);
} }
} }
if (unicode_resize(&v, outpos) < 0) if (unicode_resize(&unicode, outpos) < 0)
goto onError; goto onError;
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_XDECREF(exc); Py_XDECREF(exc);
assert(_PyUnicode_CheckConsistency(v, 1)); assert(_PyUnicode_CheckConsistency(unicode, 1));
return v; return unicode;
onError: onError:
Py_XDECREF(v); Py_XDECREF(unicode);
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_XDECREF(exc); Py_XDECREF(exc);
return NULL; return NULL;