Close #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster

Patch written by Serhiy Storchaka.
This commit is contained in:
Victor Stinner 2012-10-30 23:12:47 +01:00
parent d4156c1693
commit e64322e034
3 changed files with 73 additions and 74 deletions

View file

@ -157,7 +157,7 @@ Optimizations
Major performance enhancements have been added: Major performance enhancements have been added:
* None yet. * The UTF-32 decoder is now 3x to 4x faster.
Build and C API Changes Build and C API Changes

View file

@ -10,6 +10,9 @@ What's New in Python 3.4.0 Alpha 1?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster. Patch
written by Serhiy Storchaka.
- Issue #16197: Update winreg docstrings and documentation to match code. - Issue #16197: Update winreg docstrings and documentation to match code.
Patch by Zachary Ware. Patch by Zachary Ware.

View file

@ -4804,14 +4804,8 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
Py_ssize_t outpos; Py_ssize_t outpos;
PyObject *unicode; PyObject *unicode;
const unsigned char *q, *e; const unsigned char *q, *e;
int bo = 0; /* assume native ordering by default */ int le, bo = 0; /* assume native ordering by default */
const char *errmsg = ""; const char *errmsg = "";
/* Offsets from q for retrieving bytes in the right order. */
#if PY_LITTLE_ENDIAN
int iorder[] = {0, 1, 2, 3};
#else
int iorder[] = {3, 2, 1, 0};
#endif
PyObject *errorHandler = NULL; PyObject *errorHandler = NULL;
PyObject *exc = NULL; PyObject *exc = NULL;
@ -4825,83 +4819,88 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
byte order setting accordingly. In native mode, the leading BOM byte order setting accordingly. In native mode, the leading BOM
mark is skipped, in all other modes, it is copied to the output mark is skipped, in all other modes, it is copied to the output
stream as-is (giving a ZWNBSP character). */ stream as-is (giving a ZWNBSP character). */
if (bo == 0) { if (bo == 0 && size >= 4) {
if (size >= 4) { Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | if (bom == 0x0000FEFF) {
(q[iorder[1]] << 8) | q[iorder[0]]; bo = -1;
#if PY_LITTLE_ENDIAN q += 4;
if (bom == 0x0000FEFF) {
q += 4;
bo = -1;
}
else if (bom == 0xFFFE0000) {
q += 4;
bo = 1;
}
#else
if (bom == 0x0000FEFF) {
q += 4;
bo = 1;
}
else if (bom == 0xFFFE0000) {
q += 4;
bo = -1;
}
#endif
} }
else if (bom == 0xFFFE0000) {
bo = 1;
q += 4;
}
if (byteorder)
*byteorder = bo;
} }
if (bo == -1) { if (q == e) {
/* force LE */ if (consumed)
iorder[0] = 0; *consumed = size;
iorder[1] = 1; Py_INCREF(unicode_empty);
iorder[2] = 2; return unicode_empty;
iorder[3] = 3;
}
else if (bo == 1) {
/* force BE */
iorder[0] = 3;
iorder[1] = 2;
iorder[2] = 1;
iorder[3] = 0;
} }
/* This might be one too much, because of a BOM */ #ifdef WORDS_BIGENDIAN
unicode = PyUnicode_New((size+3)/4, 127); le = bo < 0;
#else
le = bo <= 0;
#endif
unicode = PyUnicode_New((e - q + 3) / 4, 127);
if (!unicode) if (!unicode)
return NULL; return NULL;
if (size == 0)
return unicode;
outpos = 0; outpos = 0;
while (1) {
Py_UCS4 ch = 0;
Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(unicode);
while (q < e) { if (e - q >= 4) {
Py_UCS4 ch; enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
/* remaining bytes at the end? (size should be divisible by 4) */ void *data = PyUnicode_DATA(unicode);
if (e-q<4) { const unsigned char *last = e - 4;
if (consumed) if (le) {
do {
ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
if (ch > maxch)
break;
PyUnicode_WRITE(kind, data, outpos++, ch);
q += 4;
} while (q <= last);
}
else {
do {
ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
if (ch > maxch)
break;
PyUnicode_WRITE(kind, data, outpos++, ch);
q += 4;
} while (q <= last);
}
}
if (ch <= maxch) {
if (q == e || consumed)
break; break;
/* remaining bytes at the end? (size should be divisible by 4) */
errmsg = "truncated data"; errmsg = "truncated data";
startinpos = ((const char *)q)-starts; startinpos = ((const char *)q) - starts;
endinpos = ((const char *)e)-starts; endinpos = ((const char *)e) - starts;
goto utf32Error;
/* The remaining input chars are ignored if the callback
chooses to skip the input */
} }
ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | else {
(q[iorder[1]] << 8) | q[iorder[0]]; if (ch < 0x110000) {
if (unicode_putchar(&unicode, &outpos, ch) < 0)
if (ch >= 0x110000) goto onError;
{ q += 4;
continue;
}
errmsg = "codepoint not in range(0x110000)"; errmsg = "codepoint not in range(0x110000)";
startinpos = ((const char *)q)-starts; startinpos = ((const char *)q) - starts;
endinpos = startinpos+4; endinpos = startinpos + 4;
goto utf32Error;
} }
if (unicode_putchar(&unicode, &outpos, ch) < 0)
goto onError; /* The remaining input chars are ignored if the callback
q += 4; chooses to skip the input */
continue;
utf32Error:
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, &errorHandler, errors, &errorHandler,
"utf32", errmsg, "utf32", errmsg,
@ -4910,9 +4909,6 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
goto onError; goto onError;
} }
if (byteorder)
*byteorder = bo;
if (consumed) if (consumed)
*consumed = (const char *)q-starts; *consumed = (const char *)q-starts;