mirror of
https://github.com/python/cpython.git
synced 2025-09-26 18:29:57 +00:00
Reverted changeset b72c5573c5e7 (issue #15027).
This commit is contained in:
parent
3cceb38486
commit
3079328d29
4 changed files with 62 additions and 133 deletions
|
@ -1213,9 +1213,7 @@ Other Improvements
|
||||||
Significant Optimizations
|
Significant Optimizations
|
||||||
=========================
|
=========================
|
||||||
|
|
||||||
* The UTF-32 decoder is now 3x to 4x faster. The UTF-32 encoder is now 1.6x
|
* The UTF-32 decoder is now 3x to 4x faster.
|
||||||
to 3.5x faster. (Contributed by Serhiy Storchaka in :issue:`14625` and
|
|
||||||
:issue:`15027`.)
|
|
||||||
|
|
||||||
* The cost of hash collisions for sets is now reduced. Each hash table
|
* The cost of hash collisions for sets is now reduced. Each hash table
|
||||||
probe now checks a series of consecutive, adjacent key/hash pairs before
|
probe now checks a series of consecutive, adjacent key/hash pairs before
|
||||||
|
|
|
@ -10,8 +10,6 @@ Release date: 2014-01-05
|
||||||
Core and Builtins
|
Core and Builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
- Issue #15027: Rewrite the UTF-32 encoder. It is now 1.6x to 3.5x faster.
|
|
||||||
|
|
||||||
- Issue #17432: Drop UCS2 from names of Unicode functions in python3.def.
|
- Issue #17432: Drop UCS2 from names of Unicode functions in python3.def.
|
||||||
|
|
||||||
- Issue #19526: Exclude all new API from the stable ABI. Exceptions can be
|
- Issue #19526: Exclude all new API from the stable ABI. Exceptions can be
|
||||||
|
|
|
@ -718,93 +718,6 @@ STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
|
||||||
return len - (end - in + 1);
|
return len - (end - in + 1);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#if STRINGLIB_SIZEOF_CHAR == 1
|
|
||||||
# define SWAB4(CH, tmp) ((CH) << 24) /* high bytes are zero */
|
|
||||||
#elif STRINGLIB_SIZEOF_CHAR == 2
|
|
||||||
# define SWAB4(CH, tmp) (tmp = (CH), \
|
|
||||||
((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8))
|
|
||||||
/* high bytes are zero */
|
|
||||||
#else
|
|
||||||
# define SWAB4(CH, tmp) (tmp = (CH), \
|
|
||||||
tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
|
|
||||||
((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
|
|
||||||
#endif
|
|
||||||
Py_LOCAL_INLINE(Py_ssize_t)
|
|
||||||
STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
|
|
||||||
Py_ssize_t len,
|
|
||||||
PY_UINT32_T **outptr,
|
|
||||||
int native_ordering)
|
|
||||||
{
|
|
||||||
PY_UINT32_T *out = *outptr;
|
|
||||||
const STRINGLIB_CHAR *end = in + len;
|
|
||||||
if (native_ordering) {
|
|
||||||
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
|
|
||||||
while (in < unrolled_end) {
|
|
||||||
#if STRINGLIB_SIZEOF_CHAR > 1
|
|
||||||
/* check if any character is a surrogate character */
|
|
||||||
if (((in[0] ^ 0xd800) &
|
|
||||||
(in[1] ^ 0xd800) &
|
|
||||||
(in[2] ^ 0xd800) &
|
|
||||||
(in[3] ^ 0xd800) & 0xf800) == 0)
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
out[0] = in[0];
|
|
||||||
out[1] = in[1];
|
|
||||||
out[2] = in[2];
|
|
||||||
out[3] = in[3];
|
|
||||||
in += 4; out += 4;
|
|
||||||
}
|
|
||||||
while (in < end) {
|
|
||||||
Py_UCS4 ch;
|
|
||||||
ch = *in++;
|
|
||||||
#if STRINGLIB_SIZEOF_CHAR > 1
|
|
||||||
if (Py_UNICODE_IS_SURROGATE(ch)) {
|
|
||||||
/* reject surrogate characters (U+D800-U+DFFF) */
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
*out++ = ch;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
|
|
||||||
while (in < unrolled_end) {
|
|
||||||
#if STRINGLIB_SIZEOF_CHAR > 1
|
|
||||||
Py_UCS4 ch1, ch2, ch3, ch4;
|
|
||||||
/* check if any character is a surrogate character */
|
|
||||||
if (((in[0] ^ 0xd800) &
|
|
||||||
(in[1] ^ 0xd800) &
|
|
||||||
(in[2] ^ 0xd800) &
|
|
||||||
(in[3] ^ 0xd800) & 0xf800) == 0)
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
out[0] = SWAB4(in[0], ch1);
|
|
||||||
out[1] = SWAB4(in[1], ch2);
|
|
||||||
out[2] = SWAB4(in[2], ch3);
|
|
||||||
out[3] = SWAB4(in[3], ch4);
|
|
||||||
in += 4; out += 4;
|
|
||||||
}
|
|
||||||
while (in < end) {
|
|
||||||
Py_UCS4 ch = *in++;
|
|
||||||
#if STRINGLIB_SIZEOF_CHAR > 1
|
|
||||||
if (Py_UNICODE_IS_SURROGATE(ch)) {
|
|
||||||
/* reject surrogate characters (U+D800-U+DFFF) */
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
*out++ = SWAB4(ch, ch);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*outptr = out;
|
|
||||||
return len;
|
|
||||||
#if STRINGLIB_SIZEOF_CHAR > 1
|
|
||||||
fail:
|
|
||||||
*outptr = out;
|
|
||||||
return len - (end - in + 1);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
#undef SWAB4
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif /* STRINGLIB_IS_UNICODE */
|
#endif /* STRINGLIB_IS_UNICODE */
|
||||||
|
|
|
@ -5085,22 +5085,32 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
||||||
const char *errors,
|
const char *errors,
|
||||||
int byteorder)
|
int byteorder)
|
||||||
{
|
{
|
||||||
enum PyUnicode_Kind kind;
|
int kind;
|
||||||
const void *data;
|
void *data;
|
||||||
Py_ssize_t len;
|
Py_ssize_t len;
|
||||||
PyObject *v;
|
PyObject *v;
|
||||||
PY_UINT32_T *out;
|
unsigned char *p;
|
||||||
|
Py_ssize_t nsize, i;
|
||||||
|
/* Offsets from p for storing the four bytes of each code unit in the right order. */
|
||||||
#if PY_LITTLE_ENDIAN
|
#if PY_LITTLE_ENDIAN
|
||||||
int native_ordering = byteorder <= 0;
|
int iorder[] = {0, 1, 2, 3};
|
||||||
#else
|
#else
|
||||||
int native_ordering = byteorder >= 0;
|
int iorder[] = {3, 2, 1, 0};
|
||||||
#endif
|
#endif
|
||||||
const char *encoding;
|
const char *encoding;
|
||||||
Py_ssize_t nsize, pos;
|
|
||||||
PyObject *errorHandler = NULL;
|
PyObject *errorHandler = NULL;
|
||||||
PyObject *exc = NULL;
|
PyObject *exc = NULL;
|
||||||
PyObject *rep = NULL;
|
PyObject *rep = NULL;
|
||||||
|
|
||||||
|
#define STORECHAR(CH) \
|
||||||
|
do { \
|
||||||
|
p[iorder[3]] = ((CH) >> 24) & 0xff; \
|
||||||
|
p[iorder[2]] = ((CH) >> 16) & 0xff; \
|
||||||
|
p[iorder[1]] = ((CH) >> 8) & 0xff; \
|
||||||
|
p[iorder[0]] = (CH) & 0xff; \
|
||||||
|
p += 4; \
|
||||||
|
} while(0)
|
||||||
|
|
||||||
if (!PyUnicode_Check(str)) {
|
if (!PyUnicode_Check(str)) {
|
||||||
PyErr_BadArgument();
|
PyErr_BadArgument();
|
||||||
return NULL;
|
return NULL;
|
||||||
|
@ -5111,53 +5121,59 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
||||||
data = PyUnicode_DATA(str);
|
data = PyUnicode_DATA(str);
|
||||||
len = PyUnicode_GET_LENGTH(str);
|
len = PyUnicode_GET_LENGTH(str);
|
||||||
|
|
||||||
if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
|
|
||||||
return PyErr_NoMemory();
|
|
||||||
nsize = len + (byteorder == 0);
|
nsize = len + (byteorder == 0);
|
||||||
|
if (nsize > PY_SSIZE_T_MAX / 4)
|
||||||
|
return PyErr_NoMemory();
|
||||||
v = PyBytes_FromStringAndSize(NULL, nsize * 4);
|
v = PyBytes_FromStringAndSize(NULL, nsize * 4);
|
||||||
if (v == NULL)
|
if (v == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
/* output buffer is 4-bytes aligned */
|
p = (unsigned char *)PyBytes_AS_STRING(v);
|
||||||
assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
|
|
||||||
out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
|
|
||||||
if (byteorder == 0)
|
if (byteorder == 0)
|
||||||
*out++ = 0xFEFF;
|
STORECHAR(0xFEFF);
|
||||||
if (len == 0)
|
if (len == 0)
|
||||||
goto done;
|
return v;
|
||||||
|
|
||||||
if (byteorder == -1)
|
if (byteorder == -1) {
|
||||||
|
/* force LE */
|
||||||
|
iorder[0] = 0;
|
||||||
|
iorder[1] = 1;
|
||||||
|
iorder[2] = 2;
|
||||||
|
iorder[3] = 3;
|
||||||
encoding = "utf-32-le";
|
encoding = "utf-32-le";
|
||||||
else if (byteorder == 1)
|
}
|
||||||
|
else if (byteorder == 1) {
|
||||||
|
/* force BE */
|
||||||
|
iorder[0] = 3;
|
||||||
|
iorder[1] = 2;
|
||||||
|
iorder[2] = 1;
|
||||||
|
iorder[3] = 0;
|
||||||
encoding = "utf-32-be";
|
encoding = "utf-32-be";
|
||||||
|
}
|
||||||
else
|
else
|
||||||
encoding = "utf-32";
|
encoding = "utf-32";
|
||||||
|
|
||||||
if (kind == PyUnicode_1BYTE_KIND) {
|
if (kind == PyUnicode_1BYTE_KIND) {
|
||||||
ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
|
for (i = 0; i < len; i++)
|
||||||
goto done;
|
STORECHAR(PyUnicode_READ(kind, data, i));
|
||||||
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
pos = 0;
|
for (i = 0; i < len;) {
|
||||||
while (pos < len) {
|
|
||||||
Py_ssize_t repsize, moreunits;
|
Py_ssize_t repsize, moreunits;
|
||||||
|
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
||||||
if (kind == PyUnicode_2BYTE_KIND) {
|
i++;
|
||||||
pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
|
assert(ch <= MAX_UNICODE);
|
||||||
&out, native_ordering);
|
if (!Py_UNICODE_IS_SURROGATE(ch)) {
|
||||||
|
STORECHAR(ch);
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
else {
|
|
||||||
assert(kind == PyUnicode_4BYTE_KIND);
|
|
||||||
pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
|
|
||||||
&out, native_ordering);
|
|
||||||
}
|
|
||||||
if (pos == len)
|
|
||||||
break;
|
|
||||||
|
|
||||||
rep = unicode_encode_call_errorhandler(
|
rep = unicode_encode_call_errorhandler(
|
||||||
errors, &errorHandler,
|
errors, &errorHandler,
|
||||||
encoding, "surrogates not allowed",
|
encoding, "surrogates not allowed",
|
||||||
str, &exc, pos, pos + 1, &pos);
|
str, &exc, i-1, i, &i);
|
||||||
|
|
||||||
if (!rep)
|
if (!rep)
|
||||||
goto error;
|
goto error;
|
||||||
|
|
||||||
|
@ -5165,7 +5181,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
||||||
repsize = PyBytes_GET_SIZE(rep);
|
repsize = PyBytes_GET_SIZE(rep);
|
||||||
if (repsize & 3) {
|
if (repsize & 3) {
|
||||||
raise_encode_exception(&exc, encoding,
|
raise_encode_exception(&exc, encoding,
|
||||||
str, pos - 1, pos,
|
str, i - 1, i,
|
||||||
"surrogates not allowed");
|
"surrogates not allowed");
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
@ -5178,7 +5194,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
||||||
moreunits = repsize = PyUnicode_GET_LENGTH(rep);
|
moreunits = repsize = PyUnicode_GET_LENGTH(rep);
|
||||||
if (!PyUnicode_IS_ASCII(rep)) {
|
if (!PyUnicode_IS_ASCII(rep)) {
|
||||||
raise_encode_exception(&exc, encoding,
|
raise_encode_exception(&exc, encoding,
|
||||||
str, pos - 1, pos,
|
str, i - 1, i,
|
||||||
"surrogates not allowed");
|
"surrogates not allowed");
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
@ -5186,7 +5202,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
||||||
|
|
||||||
/* four bytes are reserved for each surrogate */
|
/* four bytes are reserved for each surrogate */
|
||||||
if (moreunits > 1) {
|
if (moreunits > 1) {
|
||||||
Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
|
Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
|
||||||
Py_ssize_t morebytes = 4 * (moreunits - 1);
|
Py_ssize_t morebytes = 4 * (moreunits - 1);
|
||||||
if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
|
if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
|
||||||
/* integer overflow */
|
/* integer overflow */
|
||||||
|
@ -5195,16 +5211,20 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
||||||
}
|
}
|
||||||
if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
|
if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
|
||||||
goto error;
|
goto error;
|
||||||
out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
|
p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (PyBytes_Check(rep)) {
|
if (PyBytes_Check(rep)) {
|
||||||
Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
|
Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
|
||||||
out += moreunits;
|
p += repsize;
|
||||||
} else /* rep is unicode */ {
|
} else /* rep is unicode */ {
|
||||||
|
const Py_UCS1 *repdata;
|
||||||
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
|
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
|
||||||
ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
|
repdata = PyUnicode_1BYTE_DATA(rep);
|
||||||
&out, native_ordering);
|
while (repsize--) {
|
||||||
|
Py_UCS4 ch = *repdata++;
|
||||||
|
STORECHAR(ch);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Py_CLEAR(rep);
|
Py_CLEAR(rep);
|
||||||
|
@ -5213,12 +5233,11 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
||||||
/* Cut back to size actually needed. This is necessary for, for example,
|
/* Cut back to size actually needed. This is necessary for, for example,
|
||||||
encoding of a string containing isolated surrogates and the 'ignore'
|
encoding of a string containing isolated surrogates and the 'ignore'
|
||||||
handler is used. */
|
handler is used. */
|
||||||
nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
|
nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
|
||||||
if (nsize != PyBytes_GET_SIZE(v))
|
if (nsize != PyBytes_GET_SIZE(v))
|
||||||
_PyBytes_Resize(&v, nsize);
|
_PyBytes_Resize(&v, nsize);
|
||||||
Py_XDECREF(errorHandler);
|
Py_XDECREF(errorHandler);
|
||||||
Py_XDECREF(exc);
|
Py_XDECREF(exc);
|
||||||
done:
|
|
||||||
return v;
|
return v;
|
||||||
error:
|
error:
|
||||||
Py_XDECREF(rep);
|
Py_XDECREF(rep);
|
||||||
|
@ -5226,6 +5245,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
||||||
Py_XDECREF(exc);
|
Py_XDECREF(exc);
|
||||||
Py_XDECREF(v);
|
Py_XDECREF(v);
|
||||||
return NULL;
|
return NULL;
|
||||||
|
#undef STORECHAR
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject *
|
PyObject *
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue