mirror of
https://github.com/python/cpython.git
synced 2025-11-25 21:11:09 +00:00
Speed up str[a:b] and PyUnicode_FromKindAndData
* str[a:b] doesn't scan the string for the maximum character if the string is ASCII only.
* PyUnicode_FromKindAndData() stops scanning as soon as we are sure that we cannot use a shorter character type. For example, _PyUnicode_FromUCS1() stops if we have at least one character in range U+0080-U+00FF.
This commit is contained in:
parent
702c734395
commit
b9275c104e
2 changed files with 51 additions and 27 deletions
|
|
@ -654,6 +654,8 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromString(
|
||||||
const char *u /* UTF-8 encoded string */
|
const char *u /* UTF-8 encoded string */
|
||||||
);
|
);
|
||||||
|
|
||||||
|
/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
|
||||||
|
Scan the string to find the maximum character. */
|
||||||
#ifndef Py_LIMITED_API
|
#ifndef Py_LIMITED_API
|
||||||
PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
|
PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
|
||||||
int kind,
|
int kind,
|
||||||
|
|
|
||||||
|
|
@ -969,7 +969,7 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
|
||||||
|
|
||||||
if (from_kind == to_kind
|
if (from_kind == to_kind
|
||||||
/* deny latin1 => ascii */
|
/* deny latin1 => ascii */
|
||||||
&& PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
|
&& !(!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
|
||||||
{
|
{
|
||||||
Py_MEMCPY((char*)to_data
|
Py_MEMCPY((char*)to_data
|
||||||
+ PyUnicode_KIND_SIZE(to_kind, to_start),
|
+ PyUnicode_KIND_SIZE(to_kind, to_start),
|
||||||
|
|
@ -1013,9 +1013,7 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
|
||||||
/* check if max_char(from substring) <= max_char(to) */
|
/* check if max_char(from substring) <= max_char(to) */
|
||||||
if (from_kind > to_kind
|
if (from_kind > to_kind
|
||||||
/* latin1 => ascii */
|
/* latin1 => ascii */
|
||||||
|| (PyUnicode_IS_ASCII(to)
|
|| (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
|
||||||
&& to_kind == PyUnicode_1BYTE_KIND
|
|
||||||
&& !PyUnicode_IS_ASCII(from)))
|
|
||||||
{
|
{
|
||||||
/* slow path to check for character overflow */
|
/* slow path to check for character overflow */
|
||||||
const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
|
const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
|
||||||
|
|
@ -1528,15 +1526,17 @@ static PyObject*
|
||||||
_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
|
_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
|
||||||
{
|
{
|
||||||
PyObject *res;
|
PyObject *res;
|
||||||
unsigned char max = 127;
|
unsigned char max_char = 127;
|
||||||
Py_ssize_t i;
|
Py_ssize_t i;
|
||||||
|
|
||||||
|
assert(size >= 0);
|
||||||
for (i = 0; i < size; i++) {
|
for (i = 0; i < size; i++) {
|
||||||
if (u[i] & 0x80) {
|
if (u[i] & 0x80) {
|
||||||
max = 255;
|
max_char = 255;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
res = PyUnicode_New(size, max);
|
res = PyUnicode_New(size, max_char);
|
||||||
if (!res)
|
if (!res)
|
||||||
return NULL;
|
return NULL;
|
||||||
memcpy(PyUnicode_1BYTE_DATA(res), u, size);
|
memcpy(PyUnicode_1BYTE_DATA(res), u, size);
|
||||||
|
|
@ -1547,15 +1547,21 @@ static PyObject*
|
||||||
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
|
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
|
||||||
{
|
{
|
||||||
PyObject *res;
|
PyObject *res;
|
||||||
Py_UCS2 max = 0;
|
Py_UCS2 max_char = 0;
|
||||||
Py_ssize_t i;
|
Py_ssize_t i;
|
||||||
for (i = 0; i < size; i++)
|
|
||||||
if (u[i] > max)
|
assert(size >= 0);
|
||||||
max = u[i];
|
for (i = 0; i < size; i++) {
|
||||||
res = PyUnicode_New(size, max);
|
if (u[i] > max_char) {
|
||||||
|
max_char = u[i];
|
||||||
|
if (max_char >= 256)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
res = PyUnicode_New(size, max_char);
|
||||||
if (!res)
|
if (!res)
|
||||||
return NULL;
|
return NULL;
|
||||||
if (max >= 256)
|
if (max_char >= 256)
|
||||||
memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
|
memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
|
||||||
else
|
else
|
||||||
for (i = 0; i < size; i++)
|
for (i = 0; i < size; i++)
|
||||||
|
|
@ -1567,15 +1573,21 @@ static PyObject*
|
||||||
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
|
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
|
||||||
{
|
{
|
||||||
PyObject *res;
|
PyObject *res;
|
||||||
Py_UCS4 max = 0;
|
Py_UCS4 max_char = 0;
|
||||||
Py_ssize_t i;
|
Py_ssize_t i;
|
||||||
for (i = 0; i < size; i++)
|
|
||||||
if (u[i] > max)
|
assert(size >= 0);
|
||||||
max = u[i];
|
for (i = 0; i < size; i++) {
|
||||||
res = PyUnicode_New(size, max);
|
if (u[i] > max_char) {
|
||||||
|
max_char = u[i];
|
||||||
|
if (max_char >= 0x10000)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
res = PyUnicode_New(size, max_char);
|
||||||
if (!res)
|
if (!res)
|
||||||
return NULL;
|
return NULL;
|
||||||
if (max >= 0x10000)
|
if (max_char >= 0x10000)
|
||||||
memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
|
memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
|
||||||
else {
|
else {
|
||||||
int kind = PyUnicode_KIND(res);
|
int kind = PyUnicode_KIND(res);
|
||||||
|
|
@ -1596,9 +1608,11 @@ PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
|
||||||
return _PyUnicode_FromUCS2(buffer, size);
|
return _PyUnicode_FromUCS2(buffer, size);
|
||||||
case PyUnicode_4BYTE_KIND:
|
case PyUnicode_4BYTE_KIND:
|
||||||
return _PyUnicode_FromUCS4(buffer, size);
|
return _PyUnicode_FromUCS4(buffer, size);
|
||||||
|
default:
|
||||||
|
assert(0 && "invalid kind");
|
||||||
|
PyErr_SetString(PyExc_SystemError, "invalid kind");
|
||||||
|
return NULL;
|
||||||
}
|
}
|
||||||
PyErr_SetString(PyExc_SystemError, "invalid kind");
|
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject*
|
PyObject*
|
||||||
|
|
@ -9383,11 +9397,12 @@ replace(PyObject *self, PyObject *str1,
|
||||||
maxchar = PyUnicode_MAX_CHAR_VALUE(self);
|
maxchar = PyUnicode_MAX_CHAR_VALUE(self);
|
||||||
/* Replacing u1 with u2 may cause a maxchar reduction in the
|
/* Replacing u1 with u2 may cause a maxchar reduction in the
|
||||||
result string. */
|
result string. */
|
||||||
mayshrink = maxchar > 127;
|
|
||||||
if (u2 > maxchar) {
|
if (u2 > maxchar) {
|
||||||
maxchar = u2;
|
maxchar = u2;
|
||||||
mayshrink = 0;
|
mayshrink = 0;
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
mayshrink = maxchar > 127;
|
||||||
u = PyUnicode_New(slen, maxchar);
|
u = PyUnicode_New(slen, maxchar);
|
||||||
if (!u)
|
if (!u)
|
||||||
goto error;
|
goto error;
|
||||||
|
|
@ -11039,11 +11054,18 @@ PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
kind = PyUnicode_KIND(self);
|
if (PyUnicode_IS_ASCII(self)) {
|
||||||
data = PyUnicode_1BYTE_DATA(self);
|
kind = PyUnicode_KIND(self);
|
||||||
return PyUnicode_FromKindAndData(kind,
|
data = PyUnicode_1BYTE_DATA(self);
|
||||||
data + PyUnicode_KIND_SIZE(kind, start),
|
return unicode_fromascii(data + start, length);
|
||||||
length);
|
}
|
||||||
|
else {
|
||||||
|
kind = PyUnicode_KIND(self);
|
||||||
|
data = PyUnicode_1BYTE_DATA(self);
|
||||||
|
return PyUnicode_FromKindAndData(kind,
|
||||||
|
data + PyUnicode_KIND_SIZE(kind, start),
|
||||||
|
length);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue