bpo-36346: Make using the legacy Unicode C API optional (GH-21437)

Add the compile-time option USE_UNICODE_WCHAR_CACHE. Setting it to 0 makes the interpreter stop using the wchar_t cache and the legacy Unicode C API.
2025-10-08 16:11:51 +00:00 · 2020-07-10 23:26:06 +03:00 · 2020-07-10 23:26:06 +03:00 · 4c8f09d7ce
commit 4c8f09d7ce
parent 9650fe0197
17 changed files with 360 additions and 99 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -3150,9 +3150,11 @@ unicode_get_widechar_size(PyObject *unicode)
    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));

+#if USE_UNICODE_WCHAR_CACHE
    if (_PyUnicode_WSTR(unicode) != NULL) {
        return PyUnicode_WSTR_LENGTH(unicode);
    }
+#endif /* USE_UNICODE_WCHAR_CACHE */
    assert(PyUnicode_IS_READY(unicode));

    res = _PyUnicode_LENGTH(unicode);
@ -3173,16 +3175,21 @@ unicode_get_widechar_size(PyObject *unicode)
 static void
 unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
 {
-    const wchar_t *wstr;
-
    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));

-    wstr = _PyUnicode_WSTR(unicode);
+#if USE_UNICODE_WCHAR_CACHE
+    const wchar_t *wstr = _PyUnicode_WSTR(unicode);
    if (wstr != NULL) {
        memcpy(w, wstr, size * sizeof(wchar_t));
        return;
    }
+#else /* USE_UNICODE_WCHAR_CACHE */
+    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
+        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
+        return;
+    }
+#endif /* USE_UNICODE_WCHAR_CACHE */
    assert(PyUnicode_IS_READY(unicode));

    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
@ -4378,7 +4385,6 @@ unicode_decode_call_errorhandler_wchar(
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
-    wchar_t *repwstr;
    Py_ssize_t repwlen;

    if (*errorHandler == NULL) {
@ -4424,9 +4430,19 @@ unicode_decode_call_errorhandler_wchar(
        goto onError;
    }

-    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
-    if (repwstr == NULL)
+#if USE_UNICODE_WCHAR_CACHE
+_Py_COMP_DIAG_PUSH
+_Py_COMP_DIAG_IGNORE_DEPR_DECLS
+    repwlen = PyUnicode_GetSize(repunicode);
+    if (repwlen < 0)
        goto onError;
+_Py_COMP_DIAG_POP
+#else /* USE_UNICODE_WCHAR_CACHE */
+    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
+    if (repwlen < 0)
+        goto onError;
+    repwlen--;
+#endif /* USE_UNICODE_WCHAR_CACHE */
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
@ -4446,7 +4462,7 @@ unicode_decode_call_errorhandler_wchar(
            goto onError;
        }
    }
-    wcsncpy(*buf + *outpos, repwstr, repwlen);
+    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;
@ -7748,6 +7764,7 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes,
    /* Create a substring so that we can get the UTF-16 representation
       of just the slice under consideration. */
    PyObject *substring;
+    int ret = -1;

    assert(len > 0);

@ -7759,11 +7776,22 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes,
    substring = PyUnicode_Substring(unicode, offset, offset+len);
    if (substring == NULL)
        return -1;
+#if USE_UNICODE_WCHAR_CACHE
+_Py_COMP_DIAG_PUSH
+_Py_COMP_DIAG_IGNORE_DEPR_DECLS
    p = PyUnicode_AsUnicodeAndSize(substring, &size);
    if (p == NULL) {
        Py_DECREF(substring);
        return -1;
    }
+_Py_COMP_DIAG_POP
+#else /* USE_UNICODE_WCHAR_CACHE */
+    p = PyUnicode_AsWideCharString(substring, &size);
+    Py_CLEAR(substring);
+    if (p == NULL) {
+        return -1;
+    }
+#endif /* USE_UNICODE_WCHAR_CACHE */
    assert(size <= INT_MAX);

    /* First get the size of the result */
@ -7775,16 +7803,15 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes,
        goto error;
    /* If we used a default char, then we failed! */
    if (pusedDefaultChar && *pusedDefaultChar) {
-        Py_DECREF(substring);
-        return -2;
+        ret = -2;
+        goto done;
    }

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL) {
-            Py_DECREF(substring);
-            return -1;
+            goto done;
        }
        out = PyBytes_AS_STRING(*outbytes);
    }
@ -7793,12 +7820,10 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes,
        const Py_ssize_t n = PyBytes_Size(*outbytes);
        if (outsize > PY_SSIZE_T_MAX - n) {
            PyErr_NoMemory();
-            Py_DECREF(substring);
-            return -1;
+            goto done;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
-            Py_DECREF(substring);
-            return -1;
+            goto done;
        }
        out = PyBytes_AS_STRING(*outbytes) + n;
    }
@ -7808,19 +7833,29 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes,
                                  p, (int)size,
                                  out, outsize,
                                  NULL, pusedDefaultChar);
-    Py_CLEAR(substring);
    if (outsize <= 0)
        goto error;
-    if (pusedDefaultChar && *pusedDefaultChar)
-        return -2;
-    return 0;
+    if (pusedDefaultChar && *pusedDefaultChar) {
+        ret = -2;
+        goto done;
+    }
+    ret = 0;
+
+done:
+#if USE_UNICODE_WCHAR_CACHE
+    Py_DECREF(substring);
+#else /* USE_UNICODE_WCHAR_CACHE */
+    PyMem_Free(p);
+#endif /* USE_UNICODE_WCHAR_CACHE */
+    return ret;

 error:
-    Py_XDECREF(substring);
-    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
-        return -2;
+    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
+        ret = -2;
+        goto done;
+    }
    PyErr_SetFromWindowsErr(0);
-    return -1;
+    goto done;
 }

 /*