mirror of
https://github.com/python/cpython.git
synced 2025-07-24 19:54:21 +00:00
Close #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster.
Patch written by Serhiy Storchaka.
This commit is contained in:
parent
d4156c1693
commit
e64322e034
3 changed files with 73 additions and 74 deletions
|
@ -157,7 +157,7 @@ Optimizations
|
||||||
|
|
||||||
Major performance enhancements have been added:
|
Major performance enhancements have been added:
|
||||||
|
|
||||||
* None yet.
|
* The UTF-32 decoder is now 3x to 4x faster.
|
||||||
|
|
||||||
|
|
||||||
Build and C API Changes
|
Build and C API Changes
|
||||||
|
|
|
@ -10,6 +10,9 @@ What's New in Python 3.4.0 Alpha 1?
|
||||||
Core and Builtins
|
Core and Builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- Issue #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster. Patch
|
||||||
|
written by Serhiy Storchaka.
|
||||||
|
|
||||||
- Issue #16197: Update winreg docstrings and documentation to match code.
|
- Issue #16197: Update winreg docstrings and documentation to match code.
|
||||||
Patch by Zachary Ware.
|
Patch by Zachary Ware.
|
||||||
|
|
||||||
|
|
|
@ -4804,14 +4804,8 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
|
||||||
Py_ssize_t outpos;
|
Py_ssize_t outpos;
|
||||||
PyObject *unicode;
|
PyObject *unicode;
|
||||||
const unsigned char *q, *e;
|
const unsigned char *q, *e;
|
||||||
int bo = 0; /* assume native ordering by default */
|
int le, bo = 0; /* assume native ordering by default */
|
||||||
const char *errmsg = "";
|
const char *errmsg = "";
|
||||||
/* Offsets from q for retrieving bytes in the right order. */
|
|
||||||
#if PY_LITTLE_ENDIAN
|
|
||||||
int iorder[] = {0, 1, 2, 3};
|
|
||||||
#else
|
|
||||||
int iorder[] = {3, 2, 1, 0};
|
|
||||||
#endif
|
|
||||||
PyObject *errorHandler = NULL;
|
PyObject *errorHandler = NULL;
|
||||||
PyObject *exc = NULL;
|
PyObject *exc = NULL;
|
||||||
|
|
||||||
|
@ -4825,83 +4819,88 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
|
||||||
byte order setting accordingly. In native mode, the leading BOM
|
byte order setting accordingly. In native mode, the leading BOM
|
||||||
mark is skipped, in all other modes, it is copied to the output
|
mark is skipped, in all other modes, it is copied to the output
|
||||||
stream as-is (giving a ZWNBSP character). */
|
stream as-is (giving a ZWNBSP character). */
|
||||||
if (bo == 0) {
|
if (bo == 0 && size >= 4) {
|
||||||
if (size >= 4) {
|
Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
|
||||||
const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
|
|
||||||
(q[iorder[1]] << 8) | q[iorder[0]];
|
|
||||||
#if PY_LITTLE_ENDIAN
|
|
||||||
if (bom == 0x0000FEFF) {
|
if (bom == 0x0000FEFF) {
|
||||||
q += 4;
|
|
||||||
bo = -1;
|
bo = -1;
|
||||||
|
q += 4;
|
||||||
}
|
}
|
||||||
else if (bom == 0xFFFE0000) {
|
else if (bom == 0xFFFE0000) {
|
||||||
q += 4;
|
|
||||||
bo = 1;
|
bo = 1;
|
||||||
|
q += 4;
|
||||||
}
|
}
|
||||||
|
if (byteorder)
|
||||||
|
*byteorder = bo;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (q == e) {
|
||||||
|
if (consumed)
|
||||||
|
*consumed = size;
|
||||||
|
Py_INCREF(unicode_empty);
|
||||||
|
return unicode_empty;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef WORDS_BIGENDIAN
|
||||||
|
le = bo < 0;
|
||||||
#else
|
#else
|
||||||
if (bom == 0x0000FEFF) {
|
le = bo <= 0;
|
||||||
q += 4;
|
|
||||||
bo = 1;
|
|
||||||
}
|
|
||||||
else if (bom == 0xFFFE0000) {
|
|
||||||
q += 4;
|
|
||||||
bo = -1;
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (bo == -1) {
|
unicode = PyUnicode_New((e - q + 3) / 4, 127);
|
||||||
/* force LE */
|
|
||||||
iorder[0] = 0;
|
|
||||||
iorder[1] = 1;
|
|
||||||
iorder[2] = 2;
|
|
||||||
iorder[3] = 3;
|
|
||||||
}
|
|
||||||
else if (bo == 1) {
|
|
||||||
/* force BE */
|
|
||||||
iorder[0] = 3;
|
|
||||||
iorder[1] = 2;
|
|
||||||
iorder[2] = 1;
|
|
||||||
iorder[3] = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* This might be one too many, because of a BOM */
|
|
||||||
unicode = PyUnicode_New((size+3)/4, 127);
|
|
||||||
if (!unicode)
|
if (!unicode)
|
||||||
return NULL;
|
return NULL;
|
||||||
if (size == 0)
|
|
||||||
return unicode;
|
|
||||||
outpos = 0;
|
|
||||||
|
|
||||||
while (q < e) {
|
outpos = 0;
|
||||||
Py_UCS4 ch;
|
while (1) {
|
||||||
/* remaining bytes at the end? (size should be divisible by 4) */
|
Py_UCS4 ch = 0;
|
||||||
if (e-q<4) {
|
Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(unicode);
|
||||||
if (consumed)
|
|
||||||
|
if (e - q >= 4) {
|
||||||
|
enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
|
||||||
|
void *data = PyUnicode_DATA(unicode);
|
||||||
|
const unsigned char *last = e - 4;
|
||||||
|
if (le) {
|
||||||
|
do {
|
||||||
|
ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
|
||||||
|
if (ch > maxch)
|
||||||
break;
|
break;
|
||||||
|
PyUnicode_WRITE(kind, data, outpos++, ch);
|
||||||
|
q += 4;
|
||||||
|
} while (q <= last);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
do {
|
||||||
|
ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
|
||||||
|
if (ch > maxch)
|
||||||
|
break;
|
||||||
|
PyUnicode_WRITE(kind, data, outpos++, ch);
|
||||||
|
q += 4;
|
||||||
|
} while (q <= last);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ch <= maxch) {
|
||||||
|
if (q == e || consumed)
|
||||||
|
break;
|
||||||
|
/* remaining bytes at the end? (size should be divisible by 4) */
|
||||||
errmsg = "truncated data";
|
errmsg = "truncated data";
|
||||||
startinpos = ((const char *)q) - starts;
|
startinpos = ((const char *)q) - starts;
|
||||||
endinpos = ((const char *)e) - starts;
|
endinpos = ((const char *)e) - starts;
|
||||||
goto utf32Error;
|
|
||||||
/* The remaining input chars are ignored if the callback
|
|
||||||
chooses to skip the input */
|
|
||||||
}
|
|
||||||
ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
|
|
||||||
(q[iorder[1]] << 8) | q[iorder[0]];
|
|
||||||
|
|
||||||
if (ch >= 0x110000)
|
|
||||||
{
|
|
||||||
errmsg = "codepoint not in range(0x110000)";
|
|
||||||
startinpos = ((const char *)q)-starts;
|
|
||||||
endinpos = startinpos+4;
|
|
||||||
goto utf32Error;
|
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
if (ch < 0x110000) {
|
||||||
if (unicode_putchar(&unicode, &outpos, ch) < 0)
|
if (unicode_putchar(&unicode, &outpos, ch) < 0)
|
||||||
goto onError;
|
goto onError;
|
||||||
q += 4;
|
q += 4;
|
||||||
continue;
|
continue;
|
||||||
utf32Error:
|
}
|
||||||
|
errmsg = "codepoint not in range(0x110000)";
|
||||||
|
startinpos = ((const char *)q) - starts;
|
||||||
|
endinpos = startinpos + 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The remaining input chars are ignored if the callback
|
||||||
|
chooses to skip the input */
|
||||||
if (unicode_decode_call_errorhandler(
|
if (unicode_decode_call_errorhandler(
|
||||||
errors, &errorHandler,
|
errors, &errorHandler,
|
||||||
"utf32", errmsg,
|
"utf32", errmsg,
|
||||||
|
@ -4910,9 +4909,6 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
|
||||||
goto onError;
|
goto onError;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (byteorder)
|
|
||||||
*byteorder = bo;
|
|
||||||
|
|
||||||
if (consumed)
|
if (consumed)
|
||||||
*consumed = (const char *)q-starts;
|
*consumed = (const char *)q-starts;
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue