[3.12] gh-106931: Intern Statically Allocated Strings Globally (gh-107272) (gh-110713)

We tried this before with a dict, and for all interned strings.  That ran into problems due to interpreter isolation.  However, exclusively using a per-interpreter cache caused some inconsistency that can eliminate the benefit of interning.  Here we circle back to using a global cache, but only for statically allocated strings.  We also use a more basic _Py_hashtable_t for that global cache instead of a dict.

Ideally we would only have the global cache, but the optional isolation of each interpreter's allocator means that a non-static string object must not outlive its interpreter.  Thus we would have to store a copy of each such interned string in the global cache, tied to the main interpreter.

(cherry-picked from commit b72947a8d2)
This commit is contained in:
Eric Snow 2023-11-27 16:51:12 -07:00 committed by GitHub
parent 60a08e6ff2
commit 4f71f1680d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 4324 additions and 4186 deletions

File diff suppressed because it is too large Load diff

View file

@ -140,9 +140,11 @@ typedef struct {
and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
set, use the PyASCIIObject structure. */ set, use the PyASCIIObject structure. */
unsigned int ascii:1; unsigned int ascii:1;
/* The object is statically allocated. */
unsigned int statically_allocated:1;
/* Padding to ensure that PyUnicode_DATA() is always aligned to /* Padding to ensure that PyUnicode_DATA() is always aligned to
4 bytes (see issue #19537 on m68k). */ 4 bytes (see issue #19537 on m68k). */
unsigned int :25; unsigned int :24;
} state; } state;
} PyASCIIObject; } PyASCIIObject;

View file

@ -8,6 +8,7 @@ extern "C" {
# error "this header requires Py_BUILD_CORE define" # error "this header requires Py_BUILD_CORE define"
#endif #endif
#include "pycore_hashtable.h" // _Py_hashtable_t
#include "pycore_gc.h" // PyGC_Head #include "pycore_gc.h" // PyGC_Head
#include "pycore_global_strings.h" // struct _Py_global_strings #include "pycore_global_strings.h" // struct _Py_global_strings
#include "pycore_hamt.h" // PyHamtNode_Bitmap #include "pycore_hamt.h" // PyHamtNode_Bitmap
@ -28,6 +29,11 @@ extern "C" {
#define _Py_SINGLETON(NAME) \ #define _Py_SINGLETON(NAME) \
_Py_GLOBAL_OBJECT(singletons.NAME) _Py_GLOBAL_OBJECT(singletons.NAME)
/* Runtime-wide cached objects.  An instance of this struct is embedded in
   the runtime state as `cached_objects`, next to `static_objects`. */
struct _Py_cached_objects {
    // XXX We could statically allocate the hashtable.
    /* Global intern table for statically allocated strings.  It is created
       when the main interpreter's interned-strings state is initialized and
       destroyed when the main interpreter's state is cleared; it is NULL
       outside that window.  Each string is stored as both key and value. */
    _Py_hashtable_t *interned_strings;
};
struct _Py_static_objects { struct _Py_static_objects {
struct { struct {
/* Small integers are preallocated in this array so that they /* Small integers are preallocated in this array so that they

View file

@ -106,6 +106,7 @@ PyAPI_FUNC(int) _Py_hashtable_foreach(
void *user_data); void *user_data);
PyAPI_FUNC(size_t) _Py_hashtable_size(const _Py_hashtable_t *ht); PyAPI_FUNC(size_t) _Py_hashtable_size(const _Py_hashtable_t *ht);
PyAPI_FUNC(size_t) _Py_hashtable_len(const _Py_hashtable_t *ht);
/* Add a new entry to the hash. The key must not be present in the hash table. /* Add a new entry to the hash. The key must not be present in the hash table.
Return 0 on success, -1 on memory error. */ Return 0 on success, -1 on memory error. */

View file

@ -161,6 +161,7 @@ typedef struct pyruntimestate {
/* All the objects that are shared by the runtime's interpreters. */ /* All the objects that are shared by the runtime's interpreters. */
struct _Py_static_objects static_objects; struct _Py_static_objects static_objects;
struct _Py_cached_objects cached_objects;
/* The value to use for sys.path[0] in new subinterpreters. /* The value to use for sys.path[0] in new subinterpreters.
Normally this would be part of the PyConfig struct. However, Normally this would be part of the PyConfig struct. However,

View file

@ -164,6 +164,7 @@ extern PyTypeObject _PyExc_MemoryError;
.kind = 1, \ .kind = 1, \
.compact = 1, \ .compact = 1, \
.ascii = (ASCII), \ .ascii = (ASCII), \
.statically_allocated = 1, \
}, \ }, \
} }
#define _PyASCIIObject_INIT(LITERAL) \ #define _PyASCIIObject_INIT(LITERAL) \

View file

@ -15,6 +15,7 @@ from test.support import os_helper
from test.support.script_helper import assert_python_ok, assert_python_failure from test.support.script_helper import assert_python_ok, assert_python_failure
from test.support import threading_helper from test.support import threading_helper
from test.support import import_helper from test.support import import_helper
from test.support import interpreters
import textwrap import textwrap
import unittest import unittest
import warnings import warnings
@ -699,6 +700,33 @@ class SysModuleTest(unittest.TestCase):
self.assertRaises(TypeError, sys.intern, S("abc")) self.assertRaises(TypeError, sys.intern, S("abc"))
def test_subinterp_intern_dynamically_allocated(self):
# Build the string at runtime with a random suffix, so it is a normal
# (dynamically allocated) object that is presumably not interned yet.
s = "never interned before" + str(random.randrange(0, 10**9))
t = sys.intern(s)
# Interning in this interpreter must hand back the very same object.
self.assertIs(t, s)
# Intern an equal string inside a fresh subinterpreter.  The {id(...)}
# placeholders are evaluated here, in the calling interpreter, so the
# script compares the subinterpreter's object against this interpreter's
# object ids and requires them to differ.
interp = interpreters.create()
interp.run(textwrap.dedent(f'''
import sys
t = sys.intern({s!r})
assert id(t) != {id(s)}, (id(t), {id(s)})
assert id(t) != {id(t)}, (id(t), {id(t)})
'''))
def test_subinterp_intern_statically_allocated(self):
    # See Tools/build/generate_global_objects.py for the list
    # of strings that are always statically allocated.
    s = '__init__'
    t = sys.intern(s)

    # Intern the same text inside a fresh subinterpreter.  {id(t)} is
    # evaluated here, in the calling interpreter, so the script asserts
    # that the subinterpreter gets back the identical object: statically
    # allocated strings are expected to be interned globally rather than
    # per interpreter.
    interp = interpreters.create()
    interp.run(textwrap.dedent(f'''
        import sys
        t = sys.intern({s!r})
        assert id(t) == {id(t)}, (id(t), {id(t)})
        '''))
def test_sys_flags(self): def test_sys_flags(self):
self.assertTrue(sys.flags) self.assertTrue(sys.flags)
attrs = ("debug", attrs = ("debug",

View file

@ -0,0 +1,3 @@
Statically allocated string objects are now interned globally instead of
per-interpreter. This fixes a situation where such a string would only be
interned in a single interpreter. Normal string objects are unaffected.

View file

@ -235,15 +235,54 @@ static inline PyObject *get_interned_dict(PyInterpreterState *interp)
return _Py_INTERP_CACHED_OBJECT(interp, interned_strings); return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
} }
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
Py_ssize_t Py_ssize_t
_PyUnicode_InternedSize(void) _PyUnicode_InternedSize(void)
{ {
return PyObject_Length(get_interned_dict(_PyInterpreterState_GET())); PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
}
static Py_hash_t unicode_hash(PyObject *);
static int unicode_compare_eq(PyObject *, PyObject *);
static Py_uhash_t
hashtable_unicode_hash(const void *key)
{
return unicode_hash((PyObject *)key);
}
static int
hashtable_unicode_compare(const void *key1, const void *key2)
{
PyObject *obj1 = (PyObject *)key1;
PyObject *obj2 = (PyObject *)key2;
if (obj1 != NULL && obj2 != NULL) {
return unicode_compare_eq(obj1, obj2);
}
else {
return obj1 == obj2;
}
} }
static int static int
init_interned_dict(PyInterpreterState *interp) init_interned_dict(PyInterpreterState *interp)
{ {
if (_Py_IsMainInterpreter(interp)) {
assert(INTERNED_STRINGS == NULL);
_Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
INTERNED_STRINGS = _Py_hashtable_new_full(
hashtable_unicode_hash,
hashtable_unicode_compare,
NULL,
NULL,
&hashtable_alloc
);
if (INTERNED_STRINGS == NULL) {
return -1;
}
}
assert(get_interned_dict(interp) == NULL); assert(get_interned_dict(interp) == NULL);
PyObject *interned = interned = PyDict_New(); PyObject *interned = interned = PyDict_New();
if (interned == NULL) { if (interned == NULL) {
@ -262,6 +301,10 @@ clear_interned_dict(PyInterpreterState *interp)
Py_DECREF(interned); Py_DECREF(interned);
_Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL; _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
} }
if (_Py_IsMainInterpreter(interp) && INTERNED_STRINGS != NULL) {
_Py_hashtable_destroy(INTERNED_STRINGS);
INTERNED_STRINGS = NULL;
}
} }
#define _Py_RETURN_UNICODE_EMPTY() \ #define _Py_RETURN_UNICODE_EMPTY() \
@ -1222,6 +1265,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
_PyUnicode_STATE(unicode).kind = kind; _PyUnicode_STATE(unicode).kind = kind;
_PyUnicode_STATE(unicode).compact = 1; _PyUnicode_STATE(unicode).compact = 1;
_PyUnicode_STATE(unicode).ascii = is_ascii; _PyUnicode_STATE(unicode).ascii = is_ascii;
_PyUnicode_STATE(unicode).statically_allocated = 0;
if (is_ascii) { if (is_ascii) {
((char*)data)[size] = 0; ((char*)data)[size] = 0;
} }
@ -1552,7 +1596,9 @@ unicode_dealloc(PyObject *unicode)
* we accidentally decref an immortal string out of existence. Since * we accidentally decref an immortal string out of existence. Since
* the string is an immortal object, just re-set the reference count. * the string is an immortal object, just re-set the reference count.
*/ */
if (PyUnicode_CHECK_INTERNED(unicode)) { if (PyUnicode_CHECK_INTERNED(unicode)
|| _PyUnicode_STATE(unicode).statically_allocated)
{
_Py_SetImmortal(unicode); _Py_SetImmortal(unicode);
return; return;
} }
@ -14502,6 +14548,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
_PyUnicode_STATE(self).kind = kind; _PyUnicode_STATE(self).kind = kind;
_PyUnicode_STATE(self).compact = 0; _PyUnicode_STATE(self).compact = 0;
_PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
_PyUnicode_STATE(self).statically_allocated = 0;
_PyUnicode_UTF8_LENGTH(self) = 0; _PyUnicode_UTF8_LENGTH(self) = 0;
_PyUnicode_UTF8(self) = NULL; _PyUnicode_UTF8(self) = NULL;
_PyUnicode_DATA_ANY(self) = NULL; _PyUnicode_DATA_ANY(self) = NULL;
@ -14725,6 +14772,23 @@ _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
return; return;
} }
/* Look in the global cache first. */
PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
if (r != NULL && r != s) {
Py_SETREF(*p, Py_NewRef(r));
return;
}
/* Handle statically allocated strings. */
if (_PyUnicode_STATE(s).statically_allocated) {
assert(_Py_IsImmortal(s));
if (_Py_hashtable_set(INTERNED_STRINGS, s, s) == 0) {
_PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
}
return;
}
/* Look in the per-interpreter cache. */
PyObject *interned = get_interned_dict(interp); PyObject *interned = get_interned_dict(interp);
assert(interned != NULL); assert(interned != NULL);
@ -14740,9 +14804,11 @@ _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
} }
if (_Py_IsImmortal(s)) { if (_Py_IsImmortal(s)) {
// XXX Restrict this to the main interpreter?
_PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC; _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
return; return;
} }
#ifdef Py_REF_DEBUG #ifdef Py_REF_DEBUG
/* The reference count value excluding the 2 references from the /* The reference count value excluding the 2 references from the
interned dictionary should be excluded from the RefTotal. The interned dictionary should be excluded from the RefTotal. The

View file

@ -128,6 +128,13 @@ _Py_hashtable_size(const _Py_hashtable_t *ht)
} }
/* Return the table's entry counter (the `nentries` field).
   `ht` must not be NULL. */
size_t
_Py_hashtable_len(const _Py_hashtable_t *ht)
{
    const size_t count = ht->nentries;
    return count;
}
_Py_hashtable_entry_t * _Py_hashtable_entry_t *
_Py_hashtable_get_entry_generic(_Py_hashtable_t *ht, const void *key) _Py_hashtable_get_entry_generic(_Py_hashtable_t *ht, const void *key)
{ {

View file

@ -208,6 +208,7 @@ class Printer:
self.write(".kind = 1,") self.write(".kind = 1,")
self.write(".compact = 1,") self.write(".compact = 1,")
self.write(".ascii = 1,") self.write(".ascii = 1,")
self.write(".statically_allocated = 1,")
self.write(f"._data = {make_string_literal(s.encode('ascii'))},") self.write(f"._data = {make_string_literal(s.encode('ascii'))},")
return f"& {name}._ascii.ob_base" return f"& {name}._ascii.ob_base"
else: else:
@ -220,6 +221,7 @@ class Printer:
self.write(f".kind = {kind},") self.write(f".kind = {kind},")
self.write(".compact = 1,") self.write(".compact = 1,")
self.write(".ascii = 0,") self.write(".ascii = 0,")
self.write(".statically_allocated = 1,")
utf8 = s.encode('utf-8') utf8 = s.encode('utf-8')
self.write(f'.utf8 = {make_string_literal(utf8)},') self.write(f'.utf8 = {make_string_literal(utf8)},')
self.write(f'.utf8_length = {len(utf8)},') self.write(f'.utf8_length = {len(utf8)},')