mirror of
https://github.com/python/cpython.git
synced 2025-08-03 16:39:00 +00:00
Issue #15027: Rewrite the UTF-32 encoder. It is now 1.6x to 3.5x faster.
This commit is contained in:
parent
41adc26708
commit
583a93943c
4 changed files with 133 additions and 62 deletions
|
@ -5085,32 +5085,22 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
|||
const char *errors,
|
||||
int byteorder)
|
||||
{
|
||||
int kind;
|
||||
void *data;
|
||||
enum PyUnicode_Kind kind;
|
||||
const void *data;
|
||||
Py_ssize_t len;
|
||||
PyObject *v;
|
||||
unsigned char *p;
|
||||
Py_ssize_t nsize, i;
|
||||
/* Offsets from p for storing byte pairs in the right order. */
|
||||
PY_UINT32_T *out;
|
||||
#if PY_LITTLE_ENDIAN
|
||||
int iorder[] = {0, 1, 2, 3};
|
||||
int native_ordering = byteorder <= 0;
|
||||
#else
|
||||
int iorder[] = {3, 2, 1, 0};
|
||||
int native_ordering = byteorder >= 0;
|
||||
#endif
|
||||
const char *encoding;
|
||||
Py_ssize_t nsize, pos;
|
||||
PyObject *errorHandler = NULL;
|
||||
PyObject *exc = NULL;
|
||||
PyObject *rep = NULL;
|
||||
|
||||
#define STORECHAR(CH) \
|
||||
do { \
|
||||
p[iorder[3]] = ((CH) >> 24) & 0xff; \
|
||||
p[iorder[2]] = ((CH) >> 16) & 0xff; \
|
||||
p[iorder[1]] = ((CH) >> 8) & 0xff; \
|
||||
p[iorder[0]] = (CH) & 0xff; \
|
||||
p += 4; \
|
||||
} while(0)
|
||||
|
||||
if (!PyUnicode_Check(str)) {
|
||||
PyErr_BadArgument();
|
||||
return NULL;
|
||||
|
@ -5121,59 +5111,53 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
|||
data = PyUnicode_DATA(str);
|
||||
len = PyUnicode_GET_LENGTH(str);
|
||||
|
||||
nsize = len + (byteorder == 0);
|
||||
if (nsize > PY_SSIZE_T_MAX / 4)
|
||||
if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
|
||||
return PyErr_NoMemory();
|
||||
nsize = len + (byteorder == 0);
|
||||
v = PyBytes_FromStringAndSize(NULL, nsize * 4);
|
||||
if (v == NULL)
|
||||
return NULL;
|
||||
|
||||
p = (unsigned char *)PyBytes_AS_STRING(v);
|
||||
/* output buffer is 4-bytes aligned */
|
||||
assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
|
||||
out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
|
||||
if (byteorder == 0)
|
||||
STORECHAR(0xFEFF);
|
||||
*out++ = 0xFEFF;
|
||||
if (len == 0)
|
||||
return v;
|
||||
goto done;
|
||||
|
||||
if (byteorder == -1) {
|
||||
/* force LE */
|
||||
iorder[0] = 0;
|
||||
iorder[1] = 1;
|
||||
iorder[2] = 2;
|
||||
iorder[3] = 3;
|
||||
if (byteorder == -1)
|
||||
encoding = "utf-32-le";
|
||||
}
|
||||
else if (byteorder == 1) {
|
||||
/* force BE */
|
||||
iorder[0] = 3;
|
||||
iorder[1] = 2;
|
||||
iorder[2] = 1;
|
||||
iorder[3] = 0;
|
||||
else if (byteorder == 1)
|
||||
encoding = "utf-32-be";
|
||||
}
|
||||
else
|
||||
encoding = "utf-32";
|
||||
|
||||
if (kind == PyUnicode_1BYTE_KIND) {
|
||||
for (i = 0; i < len; i++)
|
||||
STORECHAR(PyUnicode_READ(kind, data, i));
|
||||
return v;
|
||||
ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
|
||||
goto done;
|
||||
}
|
||||
|
||||
for (i = 0; i < len;) {
|
||||
pos = 0;
|
||||
while (pos < len) {
|
||||
Py_ssize_t repsize, moreunits;
|
||||
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
||||
i++;
|
||||
assert(ch <= MAX_UNICODE);
|
||||
if (!Py_UNICODE_IS_SURROGATE(ch)) {
|
||||
STORECHAR(ch);
|
||||
continue;
|
||||
|
||||
if (kind == PyUnicode_2BYTE_KIND) {
|
||||
pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
|
||||
&out, native_ordering);
|
||||
}
|
||||
else {
|
||||
assert(kind == PyUnicode_4BYTE_KIND);
|
||||
pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
|
||||
&out, native_ordering);
|
||||
}
|
||||
if (pos == len)
|
||||
break;
|
||||
|
||||
rep = unicode_encode_call_errorhandler(
|
||||
errors, &errorHandler,
|
||||
encoding, "surrogates not allowed",
|
||||
str, &exc, i-1, i, &i);
|
||||
|
||||
str, &exc, pos, pos + 1, &pos);
|
||||
if (!rep)
|
||||
goto error;
|
||||
|
||||
|
@ -5181,7 +5165,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
|||
repsize = PyBytes_GET_SIZE(rep);
|
||||
if (repsize & 3) {
|
||||
raise_encode_exception(&exc, encoding,
|
||||
str, i - 1, i,
|
||||
str, pos - 1, pos,
|
||||
"surrogates not allowed");
|
||||
goto error;
|
||||
}
|
||||
|
@ -5194,7 +5178,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
|||
moreunits = repsize = PyUnicode_GET_LENGTH(rep);
|
||||
if (!PyUnicode_IS_ASCII(rep)) {
|
||||
raise_encode_exception(&exc, encoding,
|
||||
str, i - 1, i,
|
||||
str, pos - 1, pos,
|
||||
"surrogates not allowed");
|
||||
goto error;
|
||||
}
|
||||
|
@ -5202,7 +5186,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
|||
|
||||
/* four bytes are reserved for each surrogate */
|
||||
if (moreunits > 1) {
|
||||
Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
|
||||
Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
|
||||
Py_ssize_t morebytes = 4 * (moreunits - 1);
|
||||
if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
|
||||
/* integer overflow */
|
||||
|
@ -5211,20 +5195,16 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
|||
}
|
||||
if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
|
||||
goto error;
|
||||
p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
|
||||
out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
|
||||
}
|
||||
|
||||
if (PyBytes_Check(rep)) {
|
||||
Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
|
||||
p += repsize;
|
||||
Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
|
||||
out += moreunits;
|
||||
} else /* rep is unicode */ {
|
||||
const Py_UCS1 *repdata;
|
||||
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
|
||||
repdata = PyUnicode_1BYTE_DATA(rep);
|
||||
while (repsize--) {
|
||||
Py_UCS4 ch = *repdata++;
|
||||
STORECHAR(ch);
|
||||
}
|
||||
ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
|
||||
&out, native_ordering);
|
||||
}
|
||||
|
||||
Py_CLEAR(rep);
|
||||
|
@ -5233,11 +5213,12 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
|||
/* Cut back to size actually needed. This is necessary for, for example,
|
||||
encoding of a string containing isolated surrogates and the 'ignore'
|
||||
handler is used. */
|
||||
nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
|
||||
nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
|
||||
if (nsize != PyBytes_GET_SIZE(v))
|
||||
_PyBytes_Resize(&v, nsize);
|
||||
Py_XDECREF(errorHandler);
|
||||
Py_XDECREF(exc);
|
||||
done:
|
||||
return v;
|
||||
error:
|
||||
Py_XDECREF(rep);
|
||||
|
@ -5245,7 +5226,6 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
|||
Py_XDECREF(exc);
|
||||
Py_XDECREF(v);
|
||||
return NULL;
|
||||
#undef STORECHAR
|
||||
}
|
||||
|
||||
PyObject *
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue