bpo-40521: Make Unicode latin1 singletons per interpreter (GH-21101)

Each interpreter now has its own Unicode latin1 singletons. Remove "ifdef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS" and "ifdef LATIN1_SINGLETONS": always enable latin1 singletons. Optimize unicode_result_ready(): only attempt to get a latin1 singleton for PyUnicode_1BYTE_KIND.
2025-08-02 08:02:56 +00:00 · 2020-06-24 02:22:21 +02:00 · 2020-06-24 02:22:21 +02:00 · 2f9ada96e0
commit 2f9ada96e0
parent bbf36e8903
3 changed files with 36 additions and 43 deletions
--- a/Include/internal/pycore_interp.h
+++ b/Include/internal/pycore_interp.h
@ -73,6 +73,9 @@ struct _Py_bytes_state {
 struct _Py_unicode_state {
    // The empty Unicode object is a singleton to improve performance.
    PyObject *empty;
    /* Single character Unicode strings in the Latin-1 range are being
       shared as well. */
    PyObject *latin1[256];
    struct _Py_unicode_fs_codec fs_codec;
 };
--- a/Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst
+++ b/Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst
@ -3,7 +3,7 @@ Each interpreter now has its own free lists, singletons and caches:
 * Free lists: float, tuple, list, dict, frame, context,
  asynchronous generator, MemoryError.
 * Singletons: empty tuple, empty bytes string, empty Unicode string,
-  single byte character.
+  single byte character, single Unicode (latin1) character.
 * Slice cache.
 They are no longer shared by all interpreters.
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -303,17 +303,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
 /* List of static strings. */
 static _Py_Identifier *static_strings = NULL;
 /* bpo-40521: Latin1 singletons are shared by all interpreters. */
 #ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
 #  define LATIN1_SINGLETONS
 #endif
 #ifdef LATIN1_SINGLETONS
 /* Single character Unicode strings in the Latin-1 range are being
   shared as well. */
 static PyObject *unicode_latin1[256] = {NULL};
 #endif
 /* Fast detection of the most frequent whitespace characters */
 const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
@ -657,9 +646,8 @@ unicode_result_wchar(PyObject *unicode)
    if (len == 1) {
        wchar_t ch = _PyUnicode_WSTR(unicode)[0];
        if ((Py_UCS4)ch < 256) {
            PyObject *latin1_char = get_latin1_char((unsigned char)ch);
            Py_DECREF(unicode);
-            return latin1_char;
+            return get_latin1_char((unsigned char)ch);
        }
    }
@ -692,13 +680,13 @@ unicode_result_ready(PyObject *unicode)
        return empty;
    }
 #ifdef LATIN1_SINGLETONS
    if (length == 1) {
        const void *data = PyUnicode_DATA(unicode);
        int kind = PyUnicode_KIND(unicode);
-        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
+        if (kind == PyUnicode_1BYTE_KIND) {
-        if (ch < 256) {
+            Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
-            PyObject *latin1_char = unicode_latin1[ch];
+            Py_UCS1 ch = data[0];
            struct _Py_unicode_state *state = get_unicode_state();
            PyObject *latin1_char = state->latin1[ch];
            if (latin1_char != NULL) {
                if (unicode != latin1_char) {
                    Py_INCREF(latin1_char);
@ -709,12 +697,14 @@ unicode_result_ready(PyObject *unicode)
            else {
                assert(_PyUnicode_CheckConsistency(unicode, 1));
                Py_INCREF(unicode);
-                unicode_latin1[ch] = unicode;
+                state->latin1[ch] = unicode;
                return unicode;
            }
        }
        else {
            assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
        }
    }
 #endif
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;
@ -1981,18 +1971,18 @@ unicode_dealloc(PyObject *unicode)
 static int
 unicode_is_singleton(PyObject *unicode)
 {
-    if (unicode == unicode_get_empty()) {
+    struct _Py_unicode_state *state = get_unicode_state();
    if (unicode == state->empty) {
        return 1;
    }
 #ifdef LATIN1_SINGLETONS
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
-        if (ch < 256 && unicode_latin1[ch] == unicode)
+        if (ch < 256 && state->latin1[ch] == unicode) {
            return 1;
        }
    }
 #endif
    return 0;
 }
 #endif
@ -2130,17 +2120,15 @@ unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
 }
 static PyObject*
-get_latin1_char(unsigned char ch)
+get_latin1_char(Py_UCS1 ch)
 {
-    PyObject *unicode;
+    struct _Py_unicode_state *state = get_unicode_state();
-#ifdef LATIN1_SINGLETONS
+    PyObject *unicode = state->latin1[ch];
    unicode = unicode_latin1[ch];
    if (unicode) {
        Py_INCREF(unicode);
        return unicode;
    }
 #endif
    unicode = PyUnicode_New(1, ch);
    if (!unicode) {
@ -2150,10 +2138,8 @@ get_latin1_char(unsigned char ch)
    PyUnicode_1BYTE_DATA(unicode)[0] = ch;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
 #ifdef LATIN1_SINGLETONS
    Py_INCREF(unicode);
-    unicode_latin1[ch] = unicode;
+    state->latin1[ch] = unicode;
 #endif
    return unicode;
 }
@ -2164,8 +2150,9 @@ unicode_char(Py_UCS4 ch)
    assert(ch <= MAX_UNICODE);
-    if (ch < 256)
+    if (ch < 256) {
        return get_latin1_char(ch);
    }
    unicode = PyUnicode_New(1, ch);
    if (unicode == NULL)
@ -2367,11 +2354,13 @@ _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
    PyObject *res;
    unsigned char max_char;
-    if (size == 0)
+    if (size == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    assert(size > 0);
-    if (size == 1)
+    if (size == 1) {
        return get_latin1_char(u[0]);
    }
    max_char = ucs1lib_find_max_char(u, u + size);
    res = PyUnicode_New(size, max_char);
@ -5008,8 +4997,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
-        if (consumed)
+        if (consumed) {
            *consumed = 1;
        }
        return get_latin1_char((unsigned char)s[0]);
    }
@ -7176,8 +7166,9 @@ PyUnicode_DecodeASCII(const char *s,
        _Py_RETURN_UNICODE_EMPTY();
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
-    if (size == 1 && (unsigned char)s[0] < 128)
+    if (size == 1 && (unsigned char)s[0] < 128) {
        return get_latin1_char((unsigned char)s[0]);
    }
    // Shortcut for simple case
    PyObject *u = PyUnicode_New(size, 127);
@ -16234,12 +16225,11 @@ _PyUnicode_Fini(PyThreadState *tstate)
    Py_CLEAR(state->empty);
    for (Py_ssize_t i = 0; i < 256; i++) {
        Py_CLEAR(state->latin1[i]);
    }
    if (is_main_interp) {
 #ifdef LATIN1_SINGLETONS
        for (Py_ssize_t i = 0; i < 256; i++) {
            Py_CLEAR(unicode_latin1[i]);
        }
 #endif
        unicode_clear_static_strings();
    }