gh-100227: Move the Dict of Interned Strings to PyInterpreterState (gh-102339)

We can revisit the options for keeping it global later, if desired.  For now, keeping it global seems quite complex, so we've gone with the simpler per-interpreter isolation solution.

https://github.com/python/cpython/issues/100227
This commit is contained in:
Eric Snow 2023-03-28 12:52:28 -06:00 committed by GitHub
parent 7703def37e
commit ba65a065cf
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 727 additions and 718 deletions

View file

@ -23,13 +23,6 @@ extern "C" {
// Only immutable objects should be considered runtime-global. // Only immutable objects should be considered runtime-global.
// All others must be per-interpreter. // All others must be per-interpreter.
#define _Py_CACHED_OBJECT(NAME) \
_PyRuntime.cached_objects.NAME
struct _Py_cached_objects {
PyObject *interned_strings;
};
#define _Py_GLOBAL_OBJECT(NAME) \ #define _Py_GLOBAL_OBJECT(NAME) \
_PyRuntime.static_objects.NAME _PyRuntime.static_objects.NAME
#define _Py_SINGLETON(NAME) \ #define _Py_SINGLETON(NAME) \
@ -65,6 +58,8 @@ struct _Py_static_objects {
(interp)->cached_objects.NAME (interp)->cached_objects.NAME
struct _Py_interp_cached_objects { struct _Py_interp_cached_objects {
PyObject *interned_strings;
/* AST */ /* AST */
PyObject *str_replace_inf; PyObject *str_replace_inf;

View file

@ -163,7 +163,6 @@ typedef struct pyruntimestate {
} types; } types;
/* All the objects that are shared by the runtime's interpreters. */ /* All the objects that are shared by the runtime's interpreters. */
struct _Py_cached_objects cached_objects;
struct _Py_static_objects static_objects; struct _Py_static_objects static_objects;
/* The following fields are here to avoid allocation during init. /* The following fields are here to avoid allocation during init.

View file

@ -59,6 +59,7 @@ struct _Py_unicode_state {
struct _Py_unicode_ids ids; struct _Py_unicode_ids ids;
}; };
extern void _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p);
extern void _PyUnicode_ClearInterned(PyInterpreterState *interp); extern void _PyUnicode_ClearInterned(PyInterpreterState *interp);

File diff suppressed because it is too large Load diff

View file

@ -231,14 +231,32 @@ static inline PyObject* unicode_new_empty(void)
Another way to look at this is to say that the actual reference Another way to look at this is to say that the actual reference
count of a string is: s->ob_refcnt + (s->state ? 2 : 0) count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
*/ */
/* Fetch the interned-strings dict stored in `interp`'s cached objects.
   No new reference is taken here.  The slot may be NULL; callers in this
   file compare the result against NULL before using it. */
static inline PyObject *
get_interned_dict(PyInterpreterState *interp)
{
    PyObject *interned = _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
    return interned;
}
static inline void set_interned_dict(PyObject *dict) static int
init_interned_dict(PyInterpreterState *interp)
{ {
_Py_CACHED_OBJECT(interned_strings) = dict; assert(get_interned_dict(interp) == NULL);
PyObject *interned = interned = PyDict_New();
if (interned == NULL) {
return -1;
}
_Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
return 0;
}
/* Empty and release `interp`'s interned-strings dict, then reset the slot
   to NULL.  Does nothing when the slot is already NULL. */
static void
clear_interned_dict(PyInterpreterState *interp)
{
    PyObject *dict = get_interned_dict(interp);
    if (dict == NULL) {
        return;
    }
    PyDict_Clear(dict);
    Py_DECREF(dict);
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
}
#define _Py_RETURN_UNICODE_EMPTY() \ #define _Py_RETURN_UNICODE_EMPTY() \
@ -1520,12 +1538,12 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
static void static void
unicode_dealloc(PyObject *unicode) unicode_dealloc(PyObject *unicode)
{ {
PyInterpreterState *interp = _PyInterpreterState_GET();
#ifdef Py_DEBUG #ifdef Py_DEBUG
if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) { if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
_Py_FatalRefcountError("deallocating an Unicode singleton"); _Py_FatalRefcountError("deallocating an Unicode singleton");
} }
#endif #endif
PyObject *interned = get_interned_dict();
if (PyUnicode_CHECK_INTERNED(unicode)) { if (PyUnicode_CHECK_INTERNED(unicode)) {
/* Revive the dead object temporarily. PyDict_DelItem() removes two /* Revive the dead object temporarily. PyDict_DelItem() removes two
references (key and value) which were ignored by references (key and value) which were ignored by
@ -1534,6 +1552,8 @@ unicode_dealloc(PyObject *unicode)
PyDict_DelItem(). */ PyDict_DelItem(). */
assert(Py_REFCNT(unicode) == 0); assert(Py_REFCNT(unicode) == 0);
Py_SET_REFCNT(unicode, 3); Py_SET_REFCNT(unicode, 3);
PyObject *interned = get_interned_dict(interp);
assert(interned != NULL);
if (PyDict_DelItem(interned, unicode) != 0) { if (PyDict_DelItem(interned, unicode) != 0) {
_PyErr_WriteUnraisableMsg("deletion of interned string failed", _PyErr_WriteUnraisableMsg("deletion of interned string failed",
NULL); NULL);
@ -14529,34 +14549,29 @@ _PyUnicode_InitState(PyInterpreterState *interp)
PyStatus PyStatus
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp) _PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
{ {
if (!_Py_IsMainInterpreter(interp)) {
return _PyStatus_OK();
}
// Initialize the global interned dict // Initialize the global interned dict
PyObject *interned = PyDict_New(); if (init_interned_dict(interp)) {
if (interned == NULL) {
PyErr_Clear(); PyErr_Clear();
return _PyStatus_ERR("failed to create interned dict"); return _PyStatus_ERR("failed to create interned dict");
} }
set_interned_dict(interned); if (_Py_IsMainInterpreter(interp)) {
/* Intern statically allocated string identifiers and deepfreeze strings.
/* Intern statically allocated string identifiers and deepfreeze strings. * This must be done before any module initialization so that statically
* This must be done before any module initialization so that statically * allocated string identifiers are used instead of heap allocated strings.
* allocated string identifiers are used instead of heap allocated strings. * Deepfreeze uses the interned identifiers if present to save space
* Deepfreeze uses the interned identifiers if present to save space * else generates them and they are interned to speed up dict lookups.
* else generates them and they are interned to speed up dict lookups. */
*/ _PyUnicode_InitStaticStrings(interp);
_PyUnicode_InitStaticStrings();
#ifdef Py_DEBUG #ifdef Py_DEBUG
assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1)); assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
for (int i = 0; i < 256; i++) { for (int i = 0; i < 256; i++) {
assert(_PyUnicode_CheckConsistency(LATIN1(i), 1)); assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
} }
#endif #endif
}
return _PyStatus_OK(); return _PyStatus_OK();
} }
@ -14586,7 +14601,7 @@ error:
void void
PyUnicode_InternInPlace(PyObject **p) _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
{ {
PyObject *s = *p; PyObject *s = *p;
#ifdef Py_DEBUG #ifdef Py_DEBUG
@ -14608,7 +14623,7 @@ PyUnicode_InternInPlace(PyObject **p)
return; return;
} }
PyObject *interned = get_interned_dict(); PyObject *interned = get_interned_dict(interp);
assert(interned != NULL); assert(interned != NULL);
PyObject *t = PyDict_SetDefault(interned, s, s); PyObject *t = PyDict_SetDefault(interned, s, s);
@ -14629,6 +14644,13 @@ PyUnicode_InternInPlace(PyObject **p)
_PyUnicode_STATE(s).interned = 1; _PyUnicode_STATE(s).interned = 1;
} }
/* Intern *p using the interpreter returned by _PyInterpreterState_GET().
   All of the work is delegated to _PyUnicode_InternInPlace(). */
void
PyUnicode_InternInPlace(PyObject **p)
{
    _PyUnicode_InternInPlace(_PyInterpreterState_GET(), p);
}
// Function kept for the stable ABI. // Function kept for the stable ABI.
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
void void
@ -14653,12 +14675,7 @@ PyUnicode_InternFromString(const char *cp)
void void
_PyUnicode_ClearInterned(PyInterpreterState *interp) _PyUnicode_ClearInterned(PyInterpreterState *interp)
{ {
if (!_Py_IsMainInterpreter(interp)) { PyObject *interned = get_interned_dict(interp);
// interned dict is shared by all interpreters
return;
}
PyObject *interned = get_interned_dict();
if (interned == NULL) { if (interned == NULL) {
return; return;
} }
@ -14693,9 +14710,7 @@ _PyUnicode_ClearInterned(PyInterpreterState *interp)
total_length); total_length);
#endif #endif
PyDict_Clear(interned); clear_interned_dict(interp);
Py_DECREF(interned);
set_interned_dict(NULL);
} }
@ -15108,7 +15123,7 @@ _PyUnicode_EnableLegacyWindowsFSEncoding(void)
static inline int static inline int
unicode_is_finalizing(void) unicode_is_finalizing(void)
{ {
return (get_interned_dict() == NULL); return (get_interned_dict(_PyInterpreterState_Main()) == NULL);
} }
#endif #endif
@ -15131,14 +15146,13 @@ _PyUnicode_Fini(PyInterpreterState *interp)
{ {
struct _Py_unicode_state *state = &interp->unicode; struct _Py_unicode_state *state = &interp->unicode;
if (_Py_IsMainInterpreter(interp)) { // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
// _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini() assert(get_interned_dict(interp) == NULL);
assert(get_interned_dict() == NULL);
// bpo-47182: force a unicodedata CAPI capsule re-import on
// subsequent initialization of main interpreter.
}
_PyUnicode_FiniEncodings(&state->fs_codec); _PyUnicode_FiniEncodings(&state->fs_codec);
// bpo-47182: force a unicodedata CAPI capsule re-import on
// subsequent initialization of interpreter.
interp->unicode.ucnhash_capi = NULL; interp->unicode.ucnhash_capi = NULL;
unicode_clear_identifiers(state); unicode_clear_identifiers(state);

View file

@ -354,14 +354,14 @@ def generate_static_strings_initializer(identifiers, strings):
printer.write(before) printer.write(before)
printer.write(START) printer.write(START)
printer.write("static inline void") printer.write("static inline void")
with printer.block("_PyUnicode_InitStaticStrings(void)"): with printer.block("_PyUnicode_InitStaticStrings(PyInterpreterState *interp)"):
printer.write(f'PyObject *string;') printer.write(f'PyObject *string;')
for i in sorted(identifiers): for i in sorted(identifiers):
# This use of _Py_ID() is ignored by iter_global_strings() # This use of _Py_ID() is ignored by iter_global_strings()
# since iter_files() ignores .h files. # since iter_files() ignores .h files.
printer.write(f'string = &_Py_ID({i});') printer.write(f'string = &_Py_ID({i});')
printer.write(f'assert(_PyUnicode_CheckConsistency(string, 1));') printer.write(f'assert(_PyUnicode_CheckConsistency(string, 1));')
printer.write(f'PyUnicode_InternInPlace(&string);') printer.write(f'_PyUnicode_InternInPlace(interp, &string);')
# XXX What about "strings"? # XXX What about "strings"?
printer.write(END) printer.write(END)
printer.write(after) printer.write(after)