[3.12] gh-106931: Intern Statically Allocated Strings Globally (gh-107272) (gh-110713)

We tried this before with a dict and for all interned strings. That ran into problems due to interpreter isolation. However, exclusively using a per-interpreter cache caused some inconsistency that can eliminate the benefit of interning. Here we circle back to using a global cache, but only for statically allocated strings. We also use a more-basic _Py_hashtable_t for that global cache instead of a dict. Ideally we would only have the global cache, but the optional isolation of each interpreter's allocator means that a non-static string object must not outlive its interpreter. Thus we would have to store a copy of each such interned string in the global cache, tied to the main interpreter. (cherry-picked from commit b72947a8d2)
2025-10-11 09:23:31 +00:00 · 2023-11-27 16:51:12 -07:00 · 2023-11-27 16:51:12 -07:00 · 4f71f1680d
commit 4f71f1680d
parent 60a08e6ff2
11 changed files with 4324 additions and 4186 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -235,15 +235,54 @@ static inline PyObject *get_interned_dict(PyInterpreterState *interp)
    return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
 }

+#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
+
 Py_ssize_t
 _PyUnicode_InternedSize(void)
 {
-    return PyObject_Length(get_interned_dict(_PyInterpreterState_GET()));
+    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
+    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
+}
+
+static Py_hash_t unicode_hash(PyObject *);
+static int unicode_compare_eq(PyObject *, PyObject *);
+
+static Py_uhash_t
+hashtable_unicode_hash(const void *key)
+{
+    return unicode_hash((PyObject *)key);
+}
+
+static int
+hashtable_unicode_compare(const void *key1, const void *key2)
+{
+    PyObject *obj1 = (PyObject *)key1;
+    PyObject *obj2 = (PyObject *)key2;
+    if (obj1 != NULL && obj2 != NULL) {
+        return unicode_compare_eq(obj1, obj2);
+    }
+    else {
+        return obj1 == obj2;
+    }
 }

 static int
 init_interned_dict(PyInterpreterState *interp)
 {
+    if (_Py_IsMainInterpreter(interp)) {
+        assert(INTERNED_STRINGS == NULL);
+        _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
+        INTERNED_STRINGS = _Py_hashtable_new_full(
+            hashtable_unicode_hash,
+            hashtable_unicode_compare,
+            NULL,
+            NULL,
+            &hashtable_alloc
+        );
+        if (INTERNED_STRINGS == NULL) {
+            return -1;
+        }
+    }
    assert(get_interned_dict(interp) == NULL);
    PyObject *interned = interned = PyDict_New();
    if (interned == NULL) {
@ -262,6 +301,10 @@ clear_interned_dict(PyInterpreterState *interp)
        Py_DECREF(interned);
        _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
    }
+    if (_Py_IsMainInterpreter(interp) && INTERNED_STRINGS != NULL) {
+        _Py_hashtable_destroy(INTERNED_STRINGS);
+        INTERNED_STRINGS = NULL;
+    }
 }

 #define _Py_RETURN_UNICODE_EMPTY()   \
@ -1222,6 +1265,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
    _PyUnicode_STATE(unicode).kind = kind;
    _PyUnicode_STATE(unicode).compact = 1;
    _PyUnicode_STATE(unicode).ascii = is_ascii;
+    _PyUnicode_STATE(unicode).statically_allocated = 0;
    if (is_ascii) {
        ((char*)data)[size] = 0;
    }
@ -1552,7 +1596,9 @@ unicode_dealloc(PyObject *unicode)
     * we accidentally decref an immortal string out of existence. Since
     * the string is an immortal object, just re-set the reference count.
     */
-    if (PyUnicode_CHECK_INTERNED(unicode)) {
+    if (PyUnicode_CHECK_INTERNED(unicode)
+        || _PyUnicode_STATE(unicode).statically_allocated)
+    {
        _Py_SetImmortal(unicode);
        return;
    }
@ -14502,6 +14548,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
    _PyUnicode_STATE(self).kind = kind;
    _PyUnicode_STATE(self).compact = 0;
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
+    _PyUnicode_STATE(self).statically_allocated = 0;
    _PyUnicode_UTF8_LENGTH(self) = 0;
    _PyUnicode_UTF8(self) = NULL;
    _PyUnicode_DATA_ANY(self) = NULL;
@ -14725,6 +14772,23 @@ _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
        return;
    }

+    /* Look in the global cache first. */
+    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
+    if (r != NULL && r != s) {
+        Py_SETREF(*p, Py_NewRef(r));
+        return;
+    }
+
+    /* Handle statically allocated strings. */
+    if (_PyUnicode_STATE(s).statically_allocated) {
+        assert(_Py_IsImmortal(s));
+        if (_Py_hashtable_set(INTERNED_STRINGS, s, s) == 0) {
+            _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
+        }
+        return;
+    }
+
+    /* Look in the per-interpreter cache. */
    PyObject *interned = get_interned_dict(interp);
    assert(interned != NULL);

@ -14740,9 +14804,11 @@ _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
    }

    if (_Py_IsImmortal(s)) {
+        // XXX Restrict this to the main interpreter?
        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
-       return;
+        return;
    }
+
 #ifdef Py_REF_DEBUG
    /* The reference count value excluding the 2 references from the
       interned dictionary should be excluded from the RefTotal. The