Issue #8670: PyUnicode_AsWideChar() and PyUnicode_AsWideCharString() replace

UTF-16 surrogate pairs with single non-BMP characters for 16-bit Py_UNICODE and 32-bit wchar_t (e.g. Linux in a narrow build).
2025-12-10 11:00:14 +00:00 · 2010-10-02 11:11:27 +00:00 · 2010-10-02 11:11:27 +00:00 · 5593d8aeb4
commit 5593d8aeb4
parent 1c24bd0252
3 changed files with 131 additions and 24 deletions
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@ -1419,6 +1419,17 @@ class UnicodeTest(string_tests.CommonTest,
        self.assertEquals(size, 7)
        self.assertEquals(wchar, 'abc\0def\0')
        nonbmp = chr(0x10ffff)
        if sizeof(c_wchar) == 2:
            buflen = 3
            nchar = 2
        else: # sizeof(c_wchar) == 4
            buflen = 2
            nchar = 1
        wchar, size = test_aswidechar(nonbmp, buflen)
        self.assertEquals(size, nchar)
        self.assertEquals(wchar, nonbmp + '\0')
    # Test PyUnicode_AsWideCharString()
    def test_aswidecharstring(self):
        from _testcapi import test_aswidecharstring
@ -1432,6 +1443,15 @@ class UnicodeTest(string_tests.CommonTest,
        self.assertEquals(size, 7)
        self.assertEquals(wchar, 'abc\0def\0')
        nonbmp = chr(0x10ffff)
        if sizeof(c_wchar) == 2:
            nchar = 2
        else: # sizeof(c_wchar) == 4
            nchar = 1
        wchar, size = test_aswidecharstring(nonbmp)
        self.assertEquals(size, nchar)
        self.assertEquals(wchar, nonbmp + '\0')
 def test_main():
    support.run_unittest(__name__)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -10,6 +10,10 @@ What's New in Python 3.2 Alpha 3?
 Core and Builtins
 -----------------
 - Issue #8670: PyUnicode_AsWideChar() and PyUnicode_AsWideCharString() replace
  UTF-16 surrogate pairs with single non-BMP characters for 16-bit Py_UNICODE
  and 32-bit wchar_t (e.g. Linux in a narrow build).
 - Issue #10006: type.__abstractmethods__ now raises an AttributeError.
 - Issue #10003: Allow handling of SIGBREAK on Windows. Fixes a regression
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -1153,19 +1153,112 @@ PyUnicode_FromFormat(const char *format, ...)
    return ret;
 }
-static void
+/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
   convert a Unicode object to a wide character string.
   - If w is NULL: return the number of wide characters (including the nul
     character) required to convert the unicode object. Ignore size argument.
   - Otherwise: return the number of wide characters (excluding the nul
     character) written into w. Write at most size wide characters (including
     the nul character). */
 static Py_ssize_t
 unicode_aswidechar(PyUnicodeObject *unicode,
                   wchar_t *w,
                   Py_ssize_t size)
 {
 #if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
    Py_ssize_t res;
    if (w != NULL) {
        res = PyUnicode_GET_SIZE(unicode);
        if (size > res)
            size = res + 1;
        else
            res = size;
        memcpy(w, unicode->str, size * sizeof(wchar_t));
-#else
+        return res;
-    register Py_UNICODE *u;
+    }
-    register Py_ssize_t i;
+    else
        return PyUnicode_GET_SIZE(unicode) + 1;
 #elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
    register const Py_UNICODE *u;
    const Py_UNICODE *uend;
    const wchar_t *worig, *wend;
    Py_ssize_t nchar;
    u = PyUnicode_AS_UNICODE(unicode);
-    for (i = size; i > 0; i--)
+    uend = u + PyUnicode_GET_SIZE(unicode);
-        *w++ = *u++;
+    if (w != NULL) {
        worig = w;
        wend = w + size;
        while (u != uend && w != wend) {
            if (0xD800 <= u[0] && u[0] <= 0xDBFF
                && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
            {
                *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
                u += 2;
            }
            else {
                *w = *u;
                u++;
            }
            w++;
        }
        if (w != wend)
            *w = L'\0';
        return w - worig;
    }
    else {
        nchar = 1; /* nul character at the end */
        while (u != uend) {
            if (0xD800 <= u[0] && u[0] <= 0xDBFF
                && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
                u += 2;
            else
                u++;
            nchar++;
        }
    }
    return nchar;
 #elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
    register Py_UNICODE *u, *uend, ordinal;
    register Py_ssize_t i;
    wchar_t *worig, *wend;
    Py_ssize_t nchar;
    u = PyUnicode_AS_UNICODE(unicode);
    uend = u + PyUnicode_GET_SIZE(u);
    if (w != NULL) {
        worig = w;
        wend = w + size;
        while (u != uend && w != wend) {
            ordinal = *u;
            if (ordinal > 0xffff) {
                ordinal -= 0x10000;
                *w++ = 0xD800 | (ordinal >> 10);
                *w++ = 0xDC00 | (ordinal & 0x3FF);
            }
            else
                *w++ = ordinal;
            u++;
        }
        if (w != wend)
            *w = 0;
        return w - worig;
    }
    else {
        nchar = 1; /* nul character */
        while (u != uend) {
            if (*u > 0xffff)
                nchar += 2;
            else
                nchar++;
            u++;
        }
        return nchar;
    }
 #else
 #  error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
 #endif
 }
@ -1178,17 +1271,7 @@ PyUnicode_AsWideChar(PyUnicodeObject *unicode,
        PyErr_BadInternalCall();
        return -1;
    }
-
+    return unicode_aswidechar(unicode, w, size);
    /* If possible, try to copy the 0-termination as well */
    if (size > PyUnicode_GET_SIZE(unicode))
        size = PyUnicode_GET_SIZE(unicode) + 1;
    unicode_aswidechar(unicode, w, size);
    if (size > PyUnicode_GET_SIZE(unicode))
        return PyUnicode_GET_SIZE(unicode);
    else
        return size;
 }
 wchar_t*
@ -1203,20 +1286,20 @@ PyUnicode_AsWideCharString(PyUnicodeObject *unicode,
        return NULL;
    }
-    if ((PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) < PyUnicode_GET_SIZE(unicode)) {
+    buflen = unicode_aswidechar(unicode, NULL, 0);
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
        PyErr_NoMemory();
        return NULL;
    }
    buflen = PyUnicode_GET_SIZE(unicode) + 1; /* copy L'\0' */
    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
    if (buffer == NULL) {
        PyErr_NoMemory();
        return NULL;
    }
-    unicode_aswidechar(unicode, buffer, buflen);
+    buflen = unicode_aswidechar(unicode, buffer, buflen);
-    if (size)
+    if (size != NULL)
-        *size = buflen - 1;
+        *size = buflen;
    return buffer;
 }