bpo-43667: Fix broken Unicode encoding in non-UTF locales on Solaris (GH-25096)

2025-11-17 01:25:57 +00:00 · 2021-04-30 15:21:42 +02:00 · 2021-04-30 15:21:42 +02:00 · 9032cf5cb1
commit 9032cf5cb1
parent 4908fae3d5
6 changed files with 194 additions and 0 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -57,6 +57,10 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 #include <windows.h>
 #endif

+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+#include "pycore_fileutils.h"     // _Py_LocaleUsesNonUnicodeWchar()
+#endif
+
 /* Uncomment to display statistics on interned strings at exit
   in _PyUnicode_ClearInterned(). */
 /* #define INTERNED_STATS 1 */
@@ -2217,6 +2221,20 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+    /* In non-Unicode locales Oracle Solaris stores wchar_t in a
+       non-Unicode internal form, so decode the input to UCS-4 first. */
+    if (_Py_LocaleUsesNonUnicodeWchar()) {
+        wchar_t *decoded = _Py_DecodeNonUnicodeWchar(u, size);
+        if (decoded == NULL) {
+            return NULL;
+        }
+        PyObject *result = _PyUnicode_FromUCS4(decoded, size);
+        PyMem_Free(decoded);
+        return result;
+    }
+#endif
+
    /* Single character Unicode objects in the Latin-1 range are
       shared when using this constructor */
    if (size == 1 && (Py_UCS4)*u < 256)
@@ -3295,6 +3313,17 @@ PyUnicode_AsWideChar(PyObject *unicode,
        res = size;
    }
    unicode_copy_as_widechar(unicode, w, size);
+
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+    /* Oracle Solaris uses a non-Unicode internal wchar_t form in
+       non-Unicode locales, so the characters just copied into w must be
+       converted in place to that form before returning to the caller. */
+    if (_Py_LocaleUsesNonUnicodeWchar()
+        && _Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
+        return -1;
+    }
+#endif
+
    return res;
 }

@@ -3321,6 +3350,17 @@ PyUnicode_AsWideCharString(PyObject *unicode,
        return NULL;
    }
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
+
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+    /* Oracle Solaris uses a non-Unicode internal wchar_t form in
+       non-Unicode locales, so convert the copied characters in place. */
+    if (_Py_LocaleUsesNonUnicodeWchar()
+        && _Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
+        PyMem_Free(buffer);  // buffer is not returned on failure: free it
+        return NULL;
+    }
+#endif
+
    if (size != NULL) {
        *size = buflen;
    }