gh-122291: Intern latin-1 one-byte strings at startup (GH-122303)

This commit is contained in:
Petr Viktorin 2024-07-27 10:27:06 +02:00 committed by GitHub
parent c08696286f
commit bb09ba6792
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 43 additions and 65 deletions

View file

@ -325,7 +325,8 @@ init_global_interned_strings(PyInterpreterState *interp)
return _PyStatus_ERR("failed to create global interned dict");
}
/* Intern statically allocated string identifiers and deepfreeze strings.
/* Intern statically allocated string identifiers, deepfreeze strings,
* and one-byte latin-1 strings.
* This must be done before any module initialization so that statically
* allocated string identifiers are used instead of heap allocated strings.
* Deepfreeze uses the interned identifiers if present to save space
@ -333,14 +334,11 @@ init_global_interned_strings(PyInterpreterState *interp)
*/
_PyUnicode_InitStaticStrings(interp);
#ifdef Py_GIL_DISABLED
// In the free-threaded build, intern the 1-byte strings as well
for (int i = 0; i < 256; i++) {
PyObject *s = LATIN1(i);
_PyUnicode_InternStatic(interp, &s);
assert(s == LATIN1(i));
}
#endif
#ifdef Py_DEBUG
assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
@ -15355,26 +15353,14 @@ intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
assert(s != NULL);
assert(_PyUnicode_CHECK(s));
assert(_PyUnicode_STATE(s).statically_allocated);
switch (PyUnicode_CHECK_INTERNED(s)) {
case SSTATE_NOT_INTERNED:
break;
case SSTATE_INTERNED_IMMORTAL_STATIC:
return s;
default:
Py_FatalError("_PyUnicode_InternStatic called on wrong string");
}
assert(!PyUnicode_CHECK_INTERNED(s));
#ifdef Py_DEBUG
/* We must not add process-global interned string if there's already a
* per-interpreter interned_dict, which might contain duplicates.
* Except "short string" singletons: those are special-cased. */
*/
PyObject *interned = get_interned_dict(interp);
assert(interned == NULL || unicode_is_singleton(s));
#ifdef Py_GIL_DISABLED
// In the free-threaded build, don't allow even the short strings.
assert(interned == NULL);
#endif
#endif
/* Look in the global cache first. */
@ -15446,11 +15432,6 @@ intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
return s;
}
/* Handle statically allocated strings. */
if (_PyUnicode_STATE(s).statically_allocated) {
return intern_static(interp, s);
}
/* Is it already interned? */
switch (PyUnicode_CHECK_INTERNED(s)) {
case SSTATE_NOT_INTERNED:
@ -15467,6 +15448,9 @@ intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
return s;
}
/* Statically allocated strings must be already interned. */
assert(!_PyUnicode_STATE(s).statically_allocated);
#if Py_GIL_DISABLED
/* In the free-threaded build, all interned strings are immortal */
immortalize = 1;
@ -15477,13 +15461,11 @@ intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
immortalize = 1;
}
/* if it's a short string, get the singleton -- and intern it */
/* if it's a short string, get the singleton */
if (PyUnicode_GET_LENGTH(s) == 1 &&
PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
if (!PyUnicode_CHECK_INTERNED(r)) {
r = intern_static(interp, r);
}
assert(PyUnicode_CHECK_INTERNED(r));
Py_DECREF(s);
return r;
}