mirror of
https://github.com/python/cpython.git
synced 2025-08-02 08:02:56 +00:00
Optimize unicode_subtype_new(): don't encode to wchar_t and decode from wchar_t
Rewrite unicode_subtype_new(): allocate the right type directly.
This commit is contained in:
parent
e90fe6a8f4
commit
07ac3ebd7b
2 changed files with 81 additions and 43 deletions
|
@ -1010,10 +1010,13 @@ class UnicodeTest(string_tests.CommonTest,
|
||||||
class UnicodeSubclass(str):
|
class UnicodeSubclass(str):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
self.assertEqual(
|
for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
|
||||||
str(UnicodeSubclass('unicode subclass becomes unicode')),
|
subclass = UnicodeSubclass(text)
|
||||||
'unicode subclass becomes unicode'
|
self.assertEqual(str(subclass), text)
|
||||||
)
|
self.assertEqual(len(subclass), len(text))
|
||||||
|
if text == 'ascii':
|
||||||
|
self.assertEqual(subclass.encode('ascii'), b'ascii')
|
||||||
|
self.assertEqual(subclass.encode('utf-8'), b'ascii')
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
str('strings are converted to unicode'),
|
str('strings are converted to unicode'),
|
||||||
|
|
|
@ -12410,56 +12410,91 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
|
||||||
static PyObject *
|
static PyObject *
|
||||||
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
|
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
|
||||||
{
|
{
|
||||||
PyUnicodeObject *tmp, *pnew;
|
PyUnicodeObject *unicode, *self;
|
||||||
Py_ssize_t n;
|
Py_ssize_t length, char_size;
|
||||||
PyObject *err = NULL;
|
int share_wstr, share_utf8;
|
||||||
|
unsigned int kind;
|
||||||
|
void *data;
|
||||||
|
|
||||||
assert(PyType_IsSubtype(type, &PyUnicode_Type));
|
assert(PyType_IsSubtype(type, &PyUnicode_Type));
|
||||||
tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
|
|
||||||
if (tmp == NULL)
|
unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
|
||||||
|
if (unicode == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
assert(PyUnicode_Check(tmp));
|
assert(PyUnicode_Check(unicode));
|
||||||
// TODO: Verify the PyUnicode_GET_SIZE does the right thing.
|
if (PyUnicode_READY(unicode))
|
||||||
// it seems kind of strange that tp_alloc gets passed the size
|
return NULL;
|
||||||
// of the unicode string because there will follow another
|
|
||||||
// malloc.
|
self = (PyUnicodeObject *) type->tp_alloc(type, 0);
|
||||||
pnew = (PyUnicodeObject *) type->tp_alloc(type,
|
if (self == NULL) {
|
||||||
n = PyUnicode_GET_SIZE(tmp));
|
Py_DECREF(unicode);
|
||||||
if (pnew == NULL) {
|
|
||||||
Py_DECREF(tmp);
|
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
_PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
|
kind = PyUnicode_KIND(unicode);
|
||||||
if (_PyUnicode_WSTR(pnew) == NULL) {
|
length = PyUnicode_GET_LENGTH(unicode);
|
||||||
err = PyErr_NoMemory();
|
|
||||||
|
_PyUnicode_LENGTH(self) = length;
|
||||||
|
_PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
|
||||||
|
_PyUnicode_STATE(self).interned = 0;
|
||||||
|
_PyUnicode_STATE(self).kind = kind;
|
||||||
|
_PyUnicode_STATE(self).compact = 0;
|
||||||
|
_PyUnicode_STATE(self).ascii = 0;
|
||||||
|
_PyUnicode_STATE(self).ready = 1;
|
||||||
|
_PyUnicode_WSTR(self) = NULL;
|
||||||
|
_PyUnicode_UTF8_LENGTH(self) = 0;
|
||||||
|
_PyUnicode_UTF8(self) = NULL;
|
||||||
|
_PyUnicode_WSTR_LENGTH(self) = 0;
|
||||||
|
self->data.any = NULL;
|
||||||
|
|
||||||
|
share_utf8 = 0;
|
||||||
|
share_wstr = 0;
|
||||||
|
if (kind == PyUnicode_1BYTE_KIND) {
|
||||||
|
char_size = 1;
|
||||||
|
if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
|
||||||
|
share_utf8 = 1;
|
||||||
|
}
|
||||||
|
else if (kind == PyUnicode_2BYTE_KIND) {
|
||||||
|
char_size = 2;
|
||||||
|
if (sizeof(wchar_t) == 2)
|
||||||
|
share_wstr = 1;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
assert(kind == PyUnicode_4BYTE_KIND);
|
||||||
|
char_size = 4;
|
||||||
|
if (sizeof(wchar_t) == 4)
|
||||||
|
share_wstr = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Ensure we won't overflow the length. */
|
||||||
|
if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
|
||||||
|
PyErr_NoMemory();
|
||||||
goto onError;
|
goto onError;
|
||||||
}
|
}
|
||||||
Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
|
data = PyObject_MALLOC((length + 1) * char_size);
|
||||||
_PyUnicode_WSTR_LENGTH(pnew) = n;
|
if (data == NULL) {
|
||||||
_PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
|
PyErr_NoMemory();
|
||||||
_PyUnicode_STATE(pnew).interned = 0;
|
|
||||||
_PyUnicode_STATE(pnew).kind = 0;
|
|
||||||
_PyUnicode_STATE(pnew).compact = 0;
|
|
||||||
_PyUnicode_STATE(pnew).ready = 0;
|
|
||||||
_PyUnicode_STATE(pnew).ascii = 0;
|
|
||||||
pnew->data.any = NULL;
|
|
||||||
_PyUnicode_LENGTH(pnew) = 0;
|
|
||||||
pnew->_base.utf8 = NULL;
|
|
||||||
pnew->_base.utf8_length = 0;
|
|
||||||
|
|
||||||
if (PyUnicode_READY(pnew) == -1) {
|
|
||||||
PyObject_FREE(_PyUnicode_WSTR(pnew));
|
|
||||||
goto onError;
|
goto onError;
|
||||||
}
|
}
|
||||||
|
|
||||||
Py_DECREF(tmp);
|
self->data.any = data;
|
||||||
return (PyObject *)pnew;
|
if (share_utf8) {
|
||||||
|
_PyUnicode_UTF8_LENGTH(self) = length;
|
||||||
|
_PyUnicode_UTF8(self) = data;
|
||||||
|
}
|
||||||
|
if (share_wstr) {
|
||||||
|
_PyUnicode_WSTR_LENGTH(self) = length;
|
||||||
|
_PyUnicode_WSTR(self) = (wchar_t *)data;
|
||||||
|
}
|
||||||
|
|
||||||
onError:
|
Py_MEMCPY(data, PyUnicode_DATA(unicode),
|
||||||
_Py_ForgetReference((PyObject *)pnew);
|
PyUnicode_KIND_SIZE(kind, length + 1));
|
||||||
PyObject_Del(pnew);
|
Py_DECREF(unicode);
|
||||||
Py_DECREF(tmp);
|
return (PyObject *)self;
|
||||||
return err;
|
|
||||||
|
onError:
|
||||||
|
Py_DECREF(unicode);
|
||||||
|
Py_DECREF(self);
|
||||||
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
PyDoc_STRVAR(unicode_doc,
|
PyDoc_STRVAR(unicode_doc,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue