bpo-46881: Statically allocate and initialize the latin1 characters. (GH-31616)

2025-11-25 04:34:37 +00:00 · 2022-03-10 04:32:00 +05:30 · 2022-03-10 04:32:00 +05:30 · 8714b6fa27
commit 8714b6fa27
parent e801e88744
6 changed files with 317 additions and 66 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -206,6 +206,11 @@ extern "C" {
            *_to++ = (to_type) *_iter++;                \
    } while (0)

+#define LATIN1(ch)  \
+    (ch < 128 \
+     ? (PyObject*)&_Py_SINGLETON(strings).ascii[ch] \
+     : (PyObject*)&_Py_SINGLETON(strings).latin1[ch - 128])
+
 #ifdef MS_WINDOWS
   /* On Windows, overallocate by 50% is the best factor */
 #  define OVERALLOCATE_FACTOR 2
@ -249,14 +254,6 @@ static int unicode_is_singleton(PyObject *unicode);
 #endif


-static struct _Py_unicode_state*
-get_unicode_state(void)
-{
-    PyInterpreterState *interp = _PyInterpreterState_GET();
-    return &interp->unicode;
-}
-
-
 // Return a borrowed reference to the empty string singleton.
 static inline PyObject* unicode_get_empty(void)
 {
@ -680,24 +677,10 @@ unicode_result_ready(PyObject *unicode)
        if (kind == PyUnicode_1BYTE_KIND) {
            const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
            Py_UCS1 ch = data[0];
-            struct _Py_unicode_state *state = get_unicode_state();
-            PyObject *latin1_char = state->latin1[ch];
-            if (latin1_char != NULL) {
-                if (unicode != latin1_char) {
-                    Py_INCREF(latin1_char);
-                    Py_DECREF(unicode);
-                }
-                return latin1_char;
+            if (unicode != LATIN1(ch)) {
+                Py_DECREF(unicode);
            }
-            else {
-                assert(_PyUnicode_CheckConsistency(unicode, 1));
-                Py_INCREF(unicode);
-                state->latin1[ch] = unicode;
-                return unicode;
-            }
-        }
-        else {
-            assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
+            return get_latin1_char(ch);
        }
    }

@ -1990,11 +1973,10 @@ unicode_is_singleton(PyObject *unicode)
        return 1;
    }

-    struct _Py_unicode_state *state = get_unicode_state();
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
-        if (ch < 256 && state->latin1[ch] == unicode) {
+        if (ch < 256 && LATIN1(ch) == unicode) {
            return 1;
        }
    }
@ -2137,25 +2119,7 @@ unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
 static PyObject*
 get_latin1_char(Py_UCS1 ch)
 {
-    struct _Py_unicode_state *state = get_unicode_state();
-
-    PyObject *unicode = state->latin1[ch];
-    if (unicode) {
-        Py_INCREF(unicode);
-        return unicode;
-    }
-
-    unicode = PyUnicode_New(1, ch);
-    if (!unicode) {
-        return NULL;
-    }
-
-    PyUnicode_1BYTE_DATA(unicode)[0] = ch;
-    assert(_PyUnicode_CheckConsistency(unicode, 1));
-
-    Py_INCREF(unicode);
-    state->latin1[ch] = unicode;
-    return unicode;
+    return Py_NewRef(LATIN1(ch));
 }

 static PyObject*
@ -15535,6 +15499,10 @@ _PyUnicode_InitGlobalObjects(PyInterpreterState *interp)

 #ifdef Py_DEBUG
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
+
+    for (int i = 0; i < 256; i++) {
+        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
+    }
 #endif

    return _PyStatus_OK();
@ -16113,10 +16081,6 @@ _PyUnicode_Fini(PyInterpreterState *interp)
    _PyUnicode_FiniEncodings(&state->fs_codec);

    unicode_clear_identifiers(state);
-
-    for (Py_ssize_t i = 0; i < 256; i++) {
-        Py_CLEAR(state->latin1[i]);
-    }
 }