mirror of
https://github.com/python/cpython.git
synced 2025-10-21 22:22:48 +00:00
Speed up the ASCII decoder
It is faster for long strings and a little bit faster for short strings. Benchmark on Linux 32 bits, Intel Core i5 @ 3.33GHz:

    ./python -m timeit 'x=b"a"' 'x.decode("ascii")'
    ./python -m timeit 'x=b"x"*80' 'x.decode("ascii")'
    ./python -m timeit 'x=b"abc"*4096' 'x.decode("ascii")'

    length | before     | after
    -------+------------+-----------
         1 | 0.234 usec | 0.229 usec
        80 | 0.381 usec | 0.357 usec
    12,288 | 11.2 usec  | 3.01 usec
This commit is contained in:
parent
00b2c86d09
commit
702c734395
1 changed files with 52 additions and 26 deletions
|
@ -1514,6 +1514,16 @@ PyUnicode_FromString(const char *u)
|
||||||
return PyUnicode_FromStringAndSize(u, size);
|
return PyUnicode_FromStringAndSize(u, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Build a new string object from the `size` bytes starting at `u`.
 *
 * The bytes are copied verbatim into the object's 1-byte data area; this
 * function performs no check that each byte is < 128. The visible caller
 * in PyUnicode_DecodeASCII only invokes it after its scan found no byte
 * with the high bit set.
 *
 * Returns the new object, or NULL if PyUnicode_New() fails.
 */
static PyObject*
|
||||||
|
unicode_fromascii(const unsigned char* u, Py_ssize_t size)
|
||||||
|
{
|
||||||
|
/* 127 is the second argument to PyUnicode_New -- presumably the maximum
   character value the string may hold (ASCII range); confirm against the
   PyUnicode_New documentation. */
PyObject *res = PyUnicode_New(size, 127);
|
||||||
|
if (!res)
|
||||||
|
return NULL;
|
||||||
|
/* One input byte maps to one output character, so a flat copy suffices. */
memcpy(PyUnicode_1BYTE_DATA(res), u, size);
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
static PyObject*
|
static PyObject*
|
||||||
_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
|
_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
|
||||||
{
|
{
|
||||||
|
@ -6477,65 +6487,81 @@ PyUnicode_DecodeASCII(const char *s,
|
||||||
{
|
{
|
||||||
const char *starts = s;
|
const char *starts = s;
|
||||||
PyUnicodeObject *v;
|
PyUnicodeObject *v;
|
||||||
Py_UNICODE *p;
|
Py_UNICODE *u;
|
||||||
Py_ssize_t startinpos;
|
Py_ssize_t startinpos;
|
||||||
Py_ssize_t endinpos;
|
Py_ssize_t endinpos;
|
||||||
Py_ssize_t outpos;
|
Py_ssize_t outpos;
|
||||||
const char *e;
|
const char *e;
|
||||||
unsigned char* d;
|
int has_error;
|
||||||
|
const unsigned char *p = (const unsigned char *)s;
|
||||||
|
const unsigned char *end = p + size;
|
||||||
|
const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
|
||||||
PyObject *errorHandler = NULL;
|
PyObject *errorHandler = NULL;
|
||||||
PyObject *exc = NULL;
|
PyObject *exc = NULL;
|
||||||
Py_ssize_t i;
|
|
||||||
|
|
||||||
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
|
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
|
||||||
if (size == 1 && *(unsigned char*)s < 128)
|
if (size == 1 && (unsigned char)s[0] < 128)
|
||||||
return PyUnicode_FromOrdinal(*(unsigned char*)s);
|
return get_latin1_char((unsigned char)s[0]);
|
||||||
|
|
||||||
/* Fast path. Assume the input actually *is* ASCII, and allocate
|
has_error = 0;
|
||||||
a single-block Unicode object with that assumption. If there is
|
while (p < end && !has_error) {
|
||||||
an error, drop the object and start over. */
|
/* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
|
||||||
v = (PyUnicodeObject*)PyUnicode_New(size, 127);
|
an explanation. */
|
||||||
if (v == NULL)
|
if (!((size_t) p & LONG_PTR_MASK)) {
|
||||||
goto onError;
|
/* Help register allocation */
|
||||||
d = PyUnicode_1BYTE_DATA(v);
|
register const unsigned char *_p = p;
|
||||||
for (i = 0; i < size; i++) {
|
while (_p < aligned_end) {
|
||||||
unsigned char ch = ((unsigned char*)s)[i];
|
unsigned long value = *(unsigned long *) _p;
|
||||||
if (ch < 128)
|
if (value & ASCII_CHAR_MASK) {
|
||||||
d[i] = ch;
|
has_error = 1;
|
||||||
else
|
break;
|
||||||
|
}
|
||||||
|
_p += SIZEOF_LONG;
|
||||||
|
}
|
||||||
|
if (_p == end)
|
||||||
|
break;
|
||||||
|
if (has_error)
|
||||||
|
break;
|
||||||
|
p = _p;
|
||||||
|
}
|
||||||
|
if (*p & 0x80) {
|
||||||
|
has_error = 1;
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
++p;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (i == size)
|
if (!has_error)
|
||||||
return (PyObject*)v;
|
return unicode_fromascii((const unsigned char *)s, size);
|
||||||
Py_DECREF(v); /* start over */
|
|
||||||
|
|
||||||
v = _PyUnicode_New(size);
|
v = _PyUnicode_New(size);
|
||||||
if (v == NULL)
|
if (v == NULL)
|
||||||
goto onError;
|
goto onError;
|
||||||
if (size == 0)
|
if (size == 0)
|
||||||
return (PyObject *)v;
|
return (PyObject *)v;
|
||||||
p = PyUnicode_AS_UNICODE(v);
|
u = PyUnicode_AS_UNICODE(v);
|
||||||
e = s + size;
|
e = s + size;
|
||||||
while (s < e) {
|
while (s < e) {
|
||||||
register unsigned char c = (unsigned char)*s;
|
register unsigned char c = (unsigned char)*s;
|
||||||
if (c < 128) {
|
if (c < 128) {
|
||||||
*p++ = c;
|
*u++ = c;
|
||||||
++s;
|
++s;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
startinpos = s-starts;
|
startinpos = s-starts;
|
||||||
endinpos = startinpos + 1;
|
endinpos = startinpos + 1;
|
||||||
outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
|
outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
|
||||||
if (unicode_decode_call_errorhandler(
|
if (unicode_decode_call_errorhandler(
|
||||||
errors, &errorHandler,
|
errors, &errorHandler,
|
||||||
"ascii", "ordinal not in range(128)",
|
"ascii", "ordinal not in range(128)",
|
||||||
&starts, &e, &startinpos, &endinpos, &exc, &s,
|
&starts, &e, &startinpos, &endinpos, &exc, &s,
|
||||||
&v, &outpos, &p))
|
&v, &outpos, &u))
|
||||||
goto onError;
|
goto onError;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
|
if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
|
||||||
if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
|
if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
|
||||||
goto onError;
|
goto onError;
|
||||||
Py_XDECREF(errorHandler);
|
Py_XDECREF(errorHandler);
|
||||||
Py_XDECREF(exc);
|
Py_XDECREF(exc);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue