mirror of
https://github.com/python/cpython.git
synced 2025-07-12 13:55:34 +00:00
[3.12] gh-106931: Intern Statically Allocated Strings Globally (gh-107272) (gh-110713)
We tried this before with a dict and for all interned strings. That ran into problems due to interpreter isolation. However, exclusively using a per-interpreter cache caused some inconsistency that can eliminate the benefit of interning. Here we circle back to using a global cache, but only for statically allocated strings. We also use a more-basic _Py_hashtable_t for that global cache instead of a dict.
Ideally we would only have the global cache, but the optional isolation of each interpreter's allocator means that a non-static string object must not outlive its interpreter. Thus we would have to store a copy of each such interned string in the global cache, tied to the main interpreter.
(cherry-picked from commit b72947a8d2)
This commit is contained in:
parent
60a08e6ff2
commit
4f71f1680d
11 changed files with 4324 additions and 4186 deletions
|
@ -235,15 +235,54 @@ static inline PyObject *get_interned_dict(PyInterpreterState *interp)
|
|||
return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
|
||||
}
|
||||
|
||||
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
|
||||
|
||||
Py_ssize_t
|
||||
_PyUnicode_InternedSize(void)
|
||||
{
|
||||
return PyObject_Length(get_interned_dict(_PyInterpreterState_GET()));
|
||||
PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
|
||||
return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
|
||||
}
|
||||
|
||||
static Py_hash_t unicode_hash(PyObject *);
|
||||
static int unicode_compare_eq(PyObject *, PyObject *);
|
||||
|
||||
static Py_uhash_t
|
||||
hashtable_unicode_hash(const void *key)
|
||||
{
|
||||
return unicode_hash((PyObject *)key);
|
||||
}
|
||||
|
||||
static int
|
||||
hashtable_unicode_compare(const void *key1, const void *key2)
|
||||
{
|
||||
PyObject *obj1 = (PyObject *)key1;
|
||||
PyObject *obj2 = (PyObject *)key2;
|
||||
if (obj1 != NULL && obj2 != NULL) {
|
||||
return unicode_compare_eq(obj1, obj2);
|
||||
}
|
||||
else {
|
||||
return obj1 == obj2;
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
init_interned_dict(PyInterpreterState *interp)
|
||||
{
|
||||
if (_Py_IsMainInterpreter(interp)) {
|
||||
assert(INTERNED_STRINGS == NULL);
|
||||
_Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
|
||||
INTERNED_STRINGS = _Py_hashtable_new_full(
|
||||
hashtable_unicode_hash,
|
||||
hashtable_unicode_compare,
|
||||
NULL,
|
||||
NULL,
|
||||
&hashtable_alloc
|
||||
);
|
||||
if (INTERNED_STRINGS == NULL) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
assert(get_interned_dict(interp) == NULL);
|
||||
PyObject *interned = interned = PyDict_New();
|
||||
if (interned == NULL) {
|
||||
|
@ -262,6 +301,10 @@ clear_interned_dict(PyInterpreterState *interp)
|
|||
Py_DECREF(interned);
|
||||
_Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
|
||||
}
|
||||
if (_Py_IsMainInterpreter(interp) && INTERNED_STRINGS != NULL) {
|
||||
_Py_hashtable_destroy(INTERNED_STRINGS);
|
||||
INTERNED_STRINGS = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
#define _Py_RETURN_UNICODE_EMPTY() \
|
||||
|
@ -1222,6 +1265,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
|
|||
_PyUnicode_STATE(unicode).kind = kind;
|
||||
_PyUnicode_STATE(unicode).compact = 1;
|
||||
_PyUnicode_STATE(unicode).ascii = is_ascii;
|
||||
_PyUnicode_STATE(unicode).statically_allocated = 0;
|
||||
if (is_ascii) {
|
||||
((char*)data)[size] = 0;
|
||||
}
|
||||
|
@ -1552,7 +1596,9 @@ unicode_dealloc(PyObject *unicode)
|
|||
* we accidentally decref an immortal string out of existence. Since
|
||||
* the string is an immortal object, just re-set the reference count.
|
||||
*/
|
||||
if (PyUnicode_CHECK_INTERNED(unicode)) {
|
||||
if (PyUnicode_CHECK_INTERNED(unicode)
|
||||
|| _PyUnicode_STATE(unicode).statically_allocated)
|
||||
{
|
||||
_Py_SetImmortal(unicode);
|
||||
return;
|
||||
}
|
||||
|
@ -14502,6 +14548,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
|
|||
_PyUnicode_STATE(self).kind = kind;
|
||||
_PyUnicode_STATE(self).compact = 0;
|
||||
_PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
|
||||
_PyUnicode_STATE(self).statically_allocated = 0;
|
||||
_PyUnicode_UTF8_LENGTH(self) = 0;
|
||||
_PyUnicode_UTF8(self) = NULL;
|
||||
_PyUnicode_DATA_ANY(self) = NULL;
|
||||
|
@ -14725,6 +14772,23 @@ _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
|
|||
return;
|
||||
}
|
||||
|
||||
/* Look in the global cache first. */
|
||||
PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
|
||||
if (r != NULL && r != s) {
|
||||
Py_SETREF(*p, Py_NewRef(r));
|
||||
return;
|
||||
}
|
||||
|
||||
/* Handle statically allocated strings. */
|
||||
if (_PyUnicode_STATE(s).statically_allocated) {
|
||||
assert(_Py_IsImmortal(s));
|
||||
if (_Py_hashtable_set(INTERNED_STRINGS, s, s) == 0) {
|
||||
_PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/* Look in the per-interpreter cache. */
|
||||
PyObject *interned = get_interned_dict(interp);
|
||||
assert(interned != NULL);
|
||||
|
||||
|
@ -14740,9 +14804,11 @@ _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
|
|||
}
|
||||
|
||||
if (_Py_IsImmortal(s)) {
|
||||
// XXX Restrict this to the main interpreter?
|
||||
_PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
|
||||
return;
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef Py_REF_DEBUG
|
||||
/* The reference count value excluding the 2 references from the
|
||||
interned dictionary should be excluded from the RefTotal. The
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue