mirror of
https://github.com/python/cpython.git
synced 2025-09-26 18:29:57 +00:00
Add interning of unicode strings by copying the functionality from
stringobject.c. Intern "True" and "False" in bool_repr() again, as was done in the 8-bit string era.
This commit is contained in:
parent
34a042d301
commit
1680713e52
5 changed files with 158 additions and 7 deletions
|
@ -48,10 +48,6 @@ typedef struct {
|
||||||
*/
|
*/
|
||||||
} PyStringObject;
|
} PyStringObject;
|
||||||
|
|
||||||
#define SSTATE_NOT_INTERNED 0
|
|
||||||
#define SSTATE_INTERNED_MORTAL 1
|
|
||||||
#define SSTATE_INTERNED_IMMORTAL 2
|
|
||||||
|
|
||||||
PyAPI_DATA(PyTypeObject) PyBaseString_Type;
|
PyAPI_DATA(PyTypeObject) PyBaseString_Type;
|
||||||
PyAPI_DATA(PyTypeObject) PyString_Type;
|
PyAPI_DATA(PyTypeObject) PyString_Type;
|
||||||
|
|
||||||
|
|
|
@ -390,6 +390,9 @@ typedef struct {
|
||||||
Py_ssize_t length; /* Length of raw Unicode data in buffer */
|
Py_ssize_t length; /* Length of raw Unicode data in buffer */
|
||||||
Py_UNICODE *str; /* Raw Unicode buffer */
|
Py_UNICODE *str; /* Raw Unicode buffer */
|
||||||
long hash; /* Hash value; -1 if not set */
|
long hash; /* Hash value; -1 if not set */
|
||||||
|
int state; /* != 0 if interned. In this case the two
|
||||||
|
* references from the dictionary to this object
|
||||||
|
* are *not* counted in ob_refcnt. */
|
||||||
PyObject *defenc; /* (Default) Encoded version as Python
|
PyObject *defenc; /* (Default) Encoded version as Python
|
||||||
string, or NULL; this is used for
|
string, or NULL; this is used for
|
||||||
implementing the buffer protocol */
|
implementing the buffer protocol */
|
||||||
|
@ -397,6 +400,10 @@ typedef struct {
|
||||||
|
|
||||||
PyAPI_DATA(PyTypeObject) PyUnicode_Type;
|
PyAPI_DATA(PyTypeObject) PyUnicode_Type;
|
||||||
|
|
||||||
|
#define SSTATE_NOT_INTERNED 0
|
||||||
|
#define SSTATE_INTERNED_MORTAL 1
|
||||||
|
#define SSTATE_INTERNED_IMMORTAL 2
|
||||||
|
|
||||||
#define PyUnicode_Check(op) \
|
#define PyUnicode_Check(op) \
|
||||||
PyType_FastSubclass((op)->ob_type, Py_TPFLAGS_UNICODE_SUBCLASS)
|
PyType_FastSubclass((op)->ob_type, Py_TPFLAGS_UNICODE_SUBCLASS)
|
||||||
#define PyUnicode_CheckExact(op) ((op)->ob_type == &PyUnicode_Type)
|
#define PyUnicode_CheckExact(op) ((op)->ob_type == &PyUnicode_Type)
|
||||||
|
@ -529,6 +536,14 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
|
||||||
PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list);
|
PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list);
|
||||||
PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...);
|
PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...);
|
||||||
|
|
||||||
|
PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
|
||||||
|
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
|
||||||
|
PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *);
|
||||||
|
PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
|
||||||
|
|
||||||
|
/* Use only if you know it's a unicode string */
|
||||||
|
#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state)
|
||||||
|
|
||||||
/* --- wchar_t support for platforms which support it --------------------- */
|
/* --- wchar_t support for platforms which support it --------------------- */
|
||||||
|
|
||||||
#ifdef HAVE_WCHAR_H
|
#ifdef HAVE_WCHAR_H
|
||||||
|
|
|
@ -521,7 +521,7 @@ Py_Main(int argc, char **argv)
|
||||||
#ifdef __INSURE__
|
#ifdef __INSURE__
|
||||||
/* Insure++ is a memory analysis tool that aids in discovering
|
/* Insure++ is a memory analysis tool that aids in discovering
|
||||||
* memory leaks and other memory problems. On Python exit, the
|
* memory leaks and other memory problems. On Python exit, the
|
||||||
* interned string dictionary is flagged as being in use at exit
|
* interned string dictionaries are flagged as being in use at exit
|
||||||
* (which it is). Under normal circumstances, this is fine because
|
* (which they are). Under normal circumstances, this is fine because
|
||||||
* the memory will be automatically reclaimed by the system. Under
|
* the memory will be automatically reclaimed by the system. Under
|
||||||
* memory debugging, it's a huge source of useless noise, so we
|
* memory debugging, it's a huge source of useless noise, so we
|
||||||
|
@ -529,6 +529,7 @@ Py_Main(int argc, char **argv)
|
||||||
* reports. -baw
|
* reports. -baw
|
||||||
*/
|
*/
|
||||||
_Py_ReleaseInternedStrings();
|
_Py_ReleaseInternedStrings();
|
||||||
|
_Py_ReleaseInternedUnicodeStrings();
|
||||||
#endif /* __INSURE__ */
|
#endif /* __INSURE__ */
|
||||||
|
|
||||||
return sts;
|
return sts;
|
||||||
|
|
|
@ -24,10 +24,10 @@ bool_repr(PyObject *self)
|
||||||
|
|
||||||
if (self == Py_True)
|
if (self == Py_True)
|
||||||
s = true_str ? true_str :
|
s = true_str ? true_str :
|
||||||
(true_str = PyUnicode_FromString("True"));
|
(true_str = PyUnicode_InternFromString("True"));
|
||||||
else
|
else
|
||||||
s = false_str ? false_str :
|
s = false_str ? false_str :
|
||||||
(false_str = PyUnicode_FromString("False"));
|
(false_str = PyUnicode_InternFromString("False"));
|
||||||
Py_XINCREF(s);
|
Py_XINCREF(s);
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
|
@ -92,6 +92,16 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* This dictionary holds all interned unicode strings. Note that references
|
||||||
|
to strings in this dictionary are *not* counted in the string's ob_refcnt.
|
||||||
|
When the interned string reaches a refcnt of 0 the string deallocation
|
||||||
|
function will delete the reference from this dictionary.
|
||||||
|
|
||||||
|
Another way to look at this is to say that the actual reference
|
||||||
|
count of a string is: s->ob_refcnt + (s->state?2:0)
|
||||||
|
*/
|
||||||
|
static PyObject *interned;
|
||||||
|
|
||||||
/* Free list for Unicode objects */
|
/* Free list for Unicode objects */
|
||||||
static PyUnicodeObject *unicode_freelist;
|
static PyUnicodeObject *unicode_freelist;
|
||||||
static int unicode_freelist_size;
|
static int unicode_freelist_size;
|
||||||
|
@ -276,6 +286,7 @@ PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
|
||||||
unicode->str[length] = 0;
|
unicode->str[length] = 0;
|
||||||
unicode->length = length;
|
unicode->length = length;
|
||||||
unicode->hash = -1;
|
unicode->hash = -1;
|
||||||
|
unicode->state = 0;
|
||||||
unicode->defenc = NULL;
|
unicode->defenc = NULL;
|
||||||
return unicode;
|
return unicode;
|
||||||
|
|
||||||
|
@ -288,6 +299,25 @@ PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
|
||||||
static
|
static
|
||||||
void unicode_dealloc(register PyUnicodeObject *unicode)
|
void unicode_dealloc(register PyUnicodeObject *unicode)
|
||||||
{
|
{
|
||||||
|
switch (PyUnicode_CHECK_INTERNED(unicode)) {
|
||||||
|
case SSTATE_NOT_INTERNED:
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SSTATE_INTERNED_MORTAL:
|
||||||
|
/* revive dead object temporarily for DelItem */
|
||||||
|
unicode->ob_refcnt = 3;
|
||||||
|
if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
|
||||||
|
Py_FatalError(
|
||||||
|
"deletion of interned unicode string failed");
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SSTATE_INTERNED_IMMORTAL:
|
||||||
|
Py_FatalError("Immortal interned unicode string died.");
|
||||||
|
|
||||||
|
default:
|
||||||
|
Py_FatalError("Inconsistent interned unicode string state.");
|
||||||
|
}
|
||||||
|
|
||||||
if (PyUnicode_CheckExact(unicode) &&
|
if (PyUnicode_CheckExact(unicode) &&
|
||||||
unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
|
unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
|
||||||
/* Keep-Alive optimization */
|
/* Keep-Alive optimization */
|
||||||
|
@ -8564,6 +8594,115 @@ _PyUnicode_Fini(void)
|
||||||
unicode_freelist_size = 0;
|
unicode_freelist_size = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
PyUnicode_InternInPlace(PyObject **p)
|
||||||
|
{
|
||||||
|
register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
|
||||||
|
PyObject *t;
|
||||||
|
if (s == NULL || !PyUnicode_Check(s))
|
||||||
|
Py_FatalError(
|
||||||
|
"PyUnicode_InternInPlace: unicode strings only please!");
|
||||||
|
/* If it's a subclass, we don't really know what putting
|
||||||
|
it in the interned dict might do. */
|
||||||
|
if (!PyUnicode_CheckExact(s))
|
||||||
|
return;
|
||||||
|
if (PyUnicode_CHECK_INTERNED(s))
|
||||||
|
return;
|
||||||
|
if (interned == NULL) {
|
||||||
|
interned = PyDict_New();
|
||||||
|
if (interned == NULL) {
|
||||||
|
PyErr_Clear(); /* Don't leave an exception */
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
t = PyDict_GetItem(interned, (PyObject *)s);
|
||||||
|
if (t) {
|
||||||
|
Py_INCREF(t);
|
||||||
|
Py_DECREF(*p);
|
||||||
|
*p = t;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
|
||||||
|
PyErr_Clear();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
/* The two references in interned are not counted by refcnt.
|
||||||
|
The deallocator will take care of this */
|
||||||
|
s->ob_refcnt -= 2;
|
||||||
|
PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
PyUnicode_InternImmortal(PyObject **p)
|
||||||
|
{
|
||||||
|
PyUnicode_InternInPlace(p);
|
||||||
|
if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
|
||||||
|
PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
|
||||||
|
Py_INCREF(*p);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
PyObject *
|
||||||
|
PyUnicode_InternFromString(const char *cp)
|
||||||
|
{
|
||||||
|
PyObject *s = PyUnicode_FromString(cp);
|
||||||
|
if (s == NULL)
|
||||||
|
return NULL;
|
||||||
|
PyUnicode_InternInPlace(&s);
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
void _Py_ReleaseInternedUnicodeStrings(void)
|
||||||
|
{
|
||||||
|
PyObject *keys;
|
||||||
|
PyUnicodeObject *s;
|
||||||
|
Py_ssize_t i, n;
|
||||||
|
Py_ssize_t immortal_size = 0, mortal_size = 0;
|
||||||
|
|
||||||
|
if (interned == NULL || !PyDict_Check(interned))
|
||||||
|
return;
|
||||||
|
keys = PyDict_Keys(interned);
|
||||||
|
if (keys == NULL || !PyList_Check(keys)) {
|
||||||
|
PyErr_Clear();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
|
||||||
|
detector, interned unicode strings are not forcibly deallocated;
|
||||||
|
rather, we give them their stolen references back, and then clear
|
||||||
|
and DECREF the interned dict. */
|
||||||
|
|
||||||
|
n = PyList_GET_SIZE(keys);
|
||||||
|
fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
|
||||||
|
n);
|
||||||
|
for (i = 0; i < n; i++) {
|
||||||
|
s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
|
||||||
|
switch (s->state) {
|
||||||
|
case SSTATE_NOT_INTERNED:
|
||||||
|
/* XXX Shouldn't happen */
|
||||||
|
break;
|
||||||
|
case SSTATE_INTERNED_IMMORTAL:
|
||||||
|
s->ob_refcnt += 1;
|
||||||
|
immortal_size += s->length;
|
||||||
|
break;
|
||||||
|
case SSTATE_INTERNED_MORTAL:
|
||||||
|
s->ob_refcnt += 2;
|
||||||
|
mortal_size += s->length;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
Py_FatalError("Inconsistent interned string state.");
|
||||||
|
}
|
||||||
|
s->state = SSTATE_NOT_INTERNED;
|
||||||
|
}
|
||||||
|
fprintf(stderr, "total size of all interned strings: "
|
||||||
|
"%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
|
||||||
|
"mortal/immortal\n", mortal_size, immortal_size);
|
||||||
|
Py_DECREF(keys);
|
||||||
|
PyDict_Clear(interned);
|
||||||
|
Py_DECREF(interned);
|
||||||
|
interned = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/********************* Unicode Iterator **************************/
|
/********************* Unicode Iterator **************************/
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue