Close #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster

Patch written by Serhiy Storchaka.
This commit is contained in:
Victor Stinner 2012-10-30 23:12:47 +01:00
parent d4156c1693
commit e64322e034
3 changed files with 73 additions and 74 deletions

View file

@ -157,7 +157,7 @@ Optimizations
Major performance enhancements have been added: Major performance enhancements have been added:
* None yet. * The UTF-32 decoder is now 3x to 4x faster.
Build and C API Changes Build and C API Changes

View file

@ -10,6 +10,9 @@ What's New in Python 3.4.0 Alpha 1?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster. Patch
written by Serhiy Storchaka.
- Issue #16197: Update winreg docstrings and documentation to match code. - Issue #16197: Update winreg docstrings and documentation to match code.
Patch by Zachary Ware. Patch by Zachary Ware.

View file

@ -4804,14 +4804,8 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
Py_ssize_t outpos; Py_ssize_t outpos;
PyObject *unicode; PyObject *unicode;
const unsigned char *q, *e; const unsigned char *q, *e;
int bo = 0; /* assume native ordering by default */ int le, bo = 0; /* assume native ordering by default */
const char *errmsg = ""; const char *errmsg = "";
/* Offsets from q for retrieving bytes in the right order. */
#if PY_LITTLE_ENDIAN
int iorder[] = {0, 1, 2, 3};
#else
int iorder[] = {3, 2, 1, 0};
#endif
PyObject *errorHandler = NULL; PyObject *errorHandler = NULL;
PyObject *exc = NULL; PyObject *exc = NULL;
@ -4825,83 +4819,88 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
byte order setting accordingly. In native mode, the leading BOM byte order setting accordingly. In native mode, the leading BOM
mark is skipped, in all other modes, it is copied to the output mark is skipped, in all other modes, it is copied to the output
stream as-is (giving a ZWNBSP character). */ stream as-is (giving a ZWNBSP character). */
if (bo == 0) { if (bo == 0 && size >= 4) {
if (size >= 4) { Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
(q[iorder[1]] << 8) | q[iorder[0]];
#if PY_LITTLE_ENDIAN
if (bom == 0x0000FEFF) { if (bom == 0x0000FEFF) {
q += 4;
bo = -1; bo = -1;
q += 4;
} }
else if (bom == 0xFFFE0000) { else if (bom == 0xFFFE0000) {
q += 4;
bo = 1; bo = 1;
q += 4;
} }
if (byteorder)
*byteorder = bo;
}
if (q == e) {
if (consumed)
*consumed = size;
Py_INCREF(unicode_empty);
return unicode_empty;
}
#ifdef WORDS_BIGENDIAN
le = bo < 0;
#else #else
if (bom == 0x0000FEFF) { le = bo <= 0;
q += 4;
bo = 1;
}
else if (bom == 0xFFFE0000) {
q += 4;
bo = -1;
}
#endif #endif
}
}
if (bo == -1) { unicode = PyUnicode_New((e - q + 3) / 4, 127);
/* force LE */
iorder[0] = 0;
iorder[1] = 1;
iorder[2] = 2;
iorder[3] = 3;
}
else if (bo == 1) {
/* force BE */
iorder[0] = 3;
iorder[1] = 2;
iorder[2] = 1;
iorder[3] = 0;
}
/* This might be one too much, because of a BOM */
unicode = PyUnicode_New((size+3)/4, 127);
if (!unicode) if (!unicode)
return NULL; return NULL;
if (size == 0)
return unicode;
outpos = 0;
while (q < e) { outpos = 0;
Py_UCS4 ch; while (1) {
/* remaining bytes at the end? (size should be divisible by 4) */ Py_UCS4 ch = 0;
if (e-q<4) { Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(unicode);
if (consumed)
if (e - q >= 4) {
enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
void *data = PyUnicode_DATA(unicode);
const unsigned char *last = e - 4;
if (le) {
do {
ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
if (ch > maxch)
break; break;
PyUnicode_WRITE(kind, data, outpos++, ch);
q += 4;
} while (q <= last);
}
else {
do {
ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
if (ch > maxch)
break;
PyUnicode_WRITE(kind, data, outpos++, ch);
q += 4;
} while (q <= last);
}
}
if (ch <= maxch) {
if (q == e || consumed)
break;
/* remaining bytes at the end? (size should be divisible by 4) */
errmsg = "truncated data"; errmsg = "truncated data";
startinpos = ((const char *)q) - starts; startinpos = ((const char *)q) - starts;
endinpos = ((const char *)e) - starts; endinpos = ((const char *)e) - starts;
goto utf32Error;
/* The remaining input chars are ignored if the callback
chooses to skip the input */
}
ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
(q[iorder[1]] << 8) | q[iorder[0]];
if (ch >= 0x110000)
{
errmsg = "codepoint not in range(0x110000)";
startinpos = ((const char *)q)-starts;
endinpos = startinpos+4;
goto utf32Error;
} }
else {
if (ch < 0x110000) {
if (unicode_putchar(&unicode, &outpos, ch) < 0) if (unicode_putchar(&unicode, &outpos, ch) < 0)
goto onError; goto onError;
q += 4; q += 4;
continue; continue;
utf32Error: }
errmsg = "codepoint not in range(0x110000)";
startinpos = ((const char *)q) - starts;
endinpos = startinpos + 4;
}
/* The remaining input chars are ignored if the callback
chooses to skip the input */
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, &errorHandler, errors, &errorHandler,
"utf32", errmsg, "utf32", errmsg,
@ -4910,9 +4909,6 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
goto onError; goto onError;
} }
if (byteorder)
*byteorder = bo;
if (consumed) if (consumed)
*consumed = (const char *)q-starts; *consumed = (const char *)q-starts;