bpo-29240: Fix locale encodings in UTF-8 Mode (#5170)

Modify locale.localeconv(), time.tzname, os.strerror() and other functions to ignore the UTF-8 Mode: always use the current locale encoding. Changes: * Add _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx(). On decoding or encoding error, they return the position of the error and an error message which are used to raise Unicode errors in PyUnicode_DecodeLocale() and PyUnicode_EncodeLocale(). * Replace _Py_DecodeCurrentLocale() with _Py_DecodeLocaleEx(). * PyUnicode_DecodeLocale() now uses _Py_DecodeLocaleEx() for all cases, especially for the strict error handler. * Add _Py_DecodeUTF8Ex(): return more information on decoding error and supports the strict error handler. * Rename _Py_EncodeUTF8_surrogateescape() to _Py_EncodeUTF8Ex(). * Replace _Py_EncodeCurrentLocale() with _Py_EncodeLocaleEx(). * Ignore the UTF-8 mode to encode/decode localeconv(), strerror() and time zone name. * Remove PyUnicode_DecodeLocale(), PyUnicode_DecodeLocaleAndSize() and PyUnicode_EncodeLocale() now ignore the UTF-8 mode: always use the "current" locale. * Remove _PyUnicode_DecodeCurrentLocale(), _PyUnicode_DecodeCurrentLocaleAndSize() and _PyUnicode_EncodeCurrentLocale().
2025-11-02 19:12:55 +00:00 · 2018-01-15 10:45:49 +01:00 · 2018-01-15 10:45:49 +01:00 · 7ed7aead95
commit 7ed7aead95
parent ee3b83547c
12 changed files with 484 additions and 517 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -3327,53 +3327,6 @@ PyUnicode_AsEncodedObject(PyObject *unicode,
    return NULL;
 }

-static size_t
-wcstombs_errorpos(const wchar_t *wstr)
-{
-    size_t len;
-#if SIZEOF_WCHAR_T == 2
-    wchar_t buf[3];
-#else
-    wchar_t buf[2];
-#endif
-    char outbuf[MB_LEN_MAX];
-    const wchar_t *start, *previous;
-
-#if SIZEOF_WCHAR_T == 2
-    buf[2] = 0;
-#else
-    buf[1] = 0;
-#endif
-    start = wstr;
-    while (*wstr != L'\0')
-    {
-        previous = wstr;
-#if SIZEOF_WCHAR_T == 2
-        if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0])
-            && Py_UNICODE_IS_LOW_SURROGATE(wstr[1]))
-        {
-            buf[0] = wstr[0];
-            buf[1] = wstr[1];
-            wstr += 2;
-        }
-        else {
-            buf[0] = *wstr;
-            buf[1] = 0;
-            wstr++;
-        }
-#else
-        buf[0] = *wstr;
-        wstr++;
-#endif
-        len = wcstombs(outbuf, buf, sizeof(outbuf));
-        if (len == (size_t)-1)
-            return previous - start;
-    }
-
-    /* failed to find the unencodable character */
-    return 0;
-}
-
 static int
 locale_error_handler(const char *errors, int *surrogateescape)
 {
@ -3396,130 +3349,60 @@ locale_error_handler(const char *errors, int *surrogateescape)
 }

 static PyObject *
-unicode_encode_locale(PyObject *unicode, const char *errors, int current_locale)
+unicode_encode_locale(PyObject *unicode, const char *errors,
+                      int current_locale)
 {
-    Py_ssize_t wlen, wlen2;
-    wchar_t *wstr;
-    char *errmsg;
-    PyObject *bytes, *reason, *exc;
-    size_t error_pos, errlen;
    int surrogateescape;
-
    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

-    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
-    if (wstr == NULL)
+    Py_ssize_t wlen;
+    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
+    if (wstr == NULL) {
        return NULL;
+    }

-    wlen2 = wcslen(wstr);
+    Py_ssize_t wlen2 = wcslen(wstr);
    if (wlen2 != wlen) {
        PyMem_Free(wstr);
        PyErr_SetString(PyExc_ValueError, "embedded null character");
        return NULL;
    }

-    if (surrogateescape) {
-        /* "surrogateescape" error handler */
-        char *str;
-
-        if (current_locale) {
-            str = _Py_EncodeCurrentLocale(wstr, &error_pos);
+    char *str;
+    size_t error_pos;
+    const char *reason;
+    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
+                                 current_locale, surrogateescape);
+    if (res != 0) {
+        if (res == -2) {
+            PyObject *exc;
+            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
+                    "locale", unicode,
+                    (Py_ssize_t)error_pos,
+                    (Py_ssize_t)(error_pos+1),
+                    reason);
+            if (exc != NULL) {
+                PyCodec_StrictErrors(exc);
+                Py_DECREF(exc);
+            }
+            return NULL;
        }
        else {
-            str = Py_EncodeLocale(wstr, &error_pos);
-        }
-        if (str == NULL) {
-            if (error_pos == (size_t)-1) {
-                PyErr_NoMemory();
-                PyMem_Free(wstr);
-                return NULL;
-            }
-            else {
-                goto encode_error;
-            }
-        }
-        PyMem_Free(wstr);
-
-        bytes = PyBytes_FromString(str);
-        if (current_locale) {
-            PyMem_RawFree(str);
-        }
-        else {
-            PyMem_Free(str);
-        }
-    }
-    else {
-        /* strict mode */
-        size_t len, len2;
-
-        len = wcstombs(NULL, wstr, 0);
-        if (len == (size_t)-1) {
-            error_pos = (size_t)-1;
-            goto encode_error;
-        }
-
-        bytes = PyBytes_FromStringAndSize(NULL, len);
-        if (bytes == NULL) {
+            PyErr_NoMemory();
            PyMem_Free(wstr);
            return NULL;
        }
-
-        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
-        if (len2 == (size_t)-1 || len2 > len) {
-            Py_DECREF(bytes);
-            error_pos = (size_t)-1;
-            goto encode_error;
-        }
-        PyMem_Free(wstr);
    }
-    return bytes;
-
-encode_error:
-    errmsg = strerror(errno);
-    assert(errmsg != NULL);
-
-    if (error_pos == (size_t)-1)
-        error_pos = wcstombs_errorpos(wstr);
-
    PyMem_Free(wstr);

-    wstr = Py_DecodeLocale(errmsg, &errlen);
-    if (wstr != NULL) {
-        reason = PyUnicode_FromWideChar(wstr, errlen);
-        PyMem_RawFree(wstr);
-    } else {
-        errmsg = NULL;
-    }
-
-    if (errmsg == NULL)
-        reason = PyUnicode_FromString(
-            "wcstombs() encountered an unencodable "
-            "wide character");
-    if (reason == NULL)
-        return NULL;
-
-    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
-                                "locale", unicode,
-                                (Py_ssize_t)error_pos,
-                                (Py_ssize_t)(error_pos+1),
-                                reason);
-    Py_DECREF(reason);
-    if (exc != NULL) {
-        PyCodec_StrictErrors(exc);
-        Py_DECREF(exc);
-    }
-    return NULL;
+    PyObject *bytes = PyBytes_FromString(str);
+    PyMem_RawFree(str);
+    return bytes;
 }

 PyObject *
 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
-{
-    return unicode_encode_locale(unicode, errors, 0);
-}
-
-PyObject *
-_PyUnicode_EncodeCurrentLocale(PyObject *unicode, const char *errors)
 {
    return unicode_encode_locale(unicode, errors, 1);
 }
@ -3687,51 +3570,11 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode,
    return NULL;
 }

-static size_t
-mbstowcs_errorpos(const char *str, size_t len)
-{
-#ifdef HAVE_MBRTOWC
-    const char *start = str;
-    mbstate_t mbs;
-    size_t converted;
-    wchar_t ch;
-
-    memset(&mbs, 0, sizeof mbs);
-    while (len)
-    {
-        converted = mbrtowc(&ch, str, len, &mbs);
-        if (converted == 0)
-            /* Reached end of string */
-            break;
-        if (converted == (size_t)-1 || converted == (size_t)-2) {
-            /* Conversion error or incomplete character */
-            return str - start;
-        }
-        else {
-            str += converted;
-            len -= converted;
-        }
-    }
-    /* failed to find the undecodable byte sequence */
-    return 0;
-#endif
-    return 0;
-}
-
 static PyObject*
 unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
                      int current_locale)
 {
-    wchar_t smallbuf[256];
-    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
-    wchar_t *wstr;
-    size_t wlen, wlen2;
-    PyObject *unicode;
    int surrogateescape;
-    size_t error_pos, errlen;
-    char *errmsg;
-    PyObject *exc, *reason = NULL;   /* initialize to prevent gcc warning */
-
    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

@ -3740,113 +3583,47 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
        return NULL;
    }

-    if (surrogateescape) {
-        /* "surrogateescape" error handler */
-        if (current_locale) {
-            wstr = _Py_DecodeCurrentLocale(str, &wlen);
+    wchar_t *wstr;
+    size_t wlen;
+    const char *reason;
+    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
+                                 current_locale, surrogateescape);
+    if (res != 0) {
+        if (res == -2) {
+            PyObject *exc;
+            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
+                                        "locale", str, len,
+                                        (Py_ssize_t)wlen,
+                                        (Py_ssize_t)(wlen + 1),
+                                        reason);
+            if (exc != NULL) {
+                PyCodec_StrictErrors(exc);
+                Py_DECREF(exc);
+            }
        }
        else {
-            wstr = Py_DecodeLocale(str, &wlen);
+            PyErr_NoMemory();
        }
-        if (wstr == NULL) {
-            if (wlen == (size_t)-1)
-                PyErr_NoMemory();
-            else
-                PyErr_SetFromErrno(PyExc_OSError);
-            return NULL;
-        }
-
-        unicode = PyUnicode_FromWideChar(wstr, wlen);
-        PyMem_RawFree(wstr);
-    }
-    else {
-        /* strict mode */
-#ifndef HAVE_BROKEN_MBSTOWCS
-        wlen = mbstowcs(NULL, str, 0);
-#else
-        wlen = len;
-#endif
-        if (wlen == (size_t)-1)
-            goto decode_error;
-        if (wlen+1 <= smallbuf_len) {
-            wstr = smallbuf;
-        }
-        else {
-            wstr = PyMem_New(wchar_t, wlen+1);
-            if (!wstr)
-                return PyErr_NoMemory();
-        }
-
-        wlen2 = mbstowcs(wstr, str, wlen+1);
-        if (wlen2 == (size_t)-1) {
-            if (wstr != smallbuf)
-                PyMem_Free(wstr);
-            goto decode_error;
-        }
-#ifdef HAVE_BROKEN_MBSTOWCS
-        assert(wlen2 == wlen);
-#endif
-        unicode = PyUnicode_FromWideChar(wstr, wlen2);
-        if (wstr != smallbuf)
-            PyMem_Free(wstr);
-    }
-    return unicode;
-
-decode_error:
-    errmsg = strerror(errno);
-    assert(errmsg != NULL);
-
-    error_pos = mbstowcs_errorpos(str, len);
-    wstr = Py_DecodeLocale(errmsg, &errlen);
-    if (wstr != NULL) {
-        reason = PyUnicode_FromWideChar(wstr, errlen);
-        PyMem_RawFree(wstr);
-    }
-
-    if (reason == NULL)
-        reason = PyUnicode_FromString(
-            "mbstowcs() encountered an invalid multibyte sequence");
-    if (reason == NULL)
        return NULL;
-
-    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
-                                "locale", str, len,
-                                (Py_ssize_t)error_pos,
-                                (Py_ssize_t)(error_pos+1),
-                                reason);
-    Py_DECREF(reason);
-    if (exc != NULL) {
-        PyCodec_StrictErrors(exc);
-        Py_DECREF(exc);
    }
-    return NULL;
+
+    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
+    PyMem_RawFree(wstr);
+    return unicode;
 }

 PyObject*
 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
                              const char *errors)
-{
-    return unicode_decode_locale(str, len, errors, 0);
-}
-
-PyObject*
-_PyUnicode_DecodeCurrentLocaleAndSize(const char *str, Py_ssize_t len,
-                                      const char *errors)
 {
    return unicode_decode_locale(str, len, errors, 1);
 }

-PyObject*
-_PyUnicode_DecodeCurrentLocale(const char *str, const char *errors)
-{
-    return unicode_decode_locale(str, (Py_ssize_t)strlen(str), errors, 1);
-}
-
 PyObject*
 PyUnicode_DecodeLocale(const char *str, const char *errors)
 {
    Py_ssize_t size = (Py_ssize_t)strlen(str);
-    return unicode_decode_locale(str, size, errors, 0);
+    return unicode_decode_locale(str, size, errors, 1);
 }


@ -3878,7 +3655,8 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
                                Py_FileSystemDefaultEncodeErrors);
    }
    else {
-        return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
+        return unicode_decode_locale(s, size,
+                                     Py_FileSystemDefaultEncodeErrors, 0);
    }
 #endif
 }
@ -5128,17 +4906,23 @@ onError:
 }


-/* UTF-8 decoder using the surrogateescape error handler .
+/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
+   non-zero, use strict error handler otherwise.

-   On success, return a pointer to a newly allocated wide character string (use
-   PyMem_RawFree() to free the memory) and write the output length (in number
-   of wchar_t units) into *p_wlen (if p_wlen is set).
+   On success, write a pointer to a newly allocated wide character string into
+   *wstr (use PyMem_RawFree() to free the memory) and write the output length
+   (in number of wchar_t units) into *wlen (if wlen is set).

-   On memory allocation failure, return -1 and write (size_t)-1 into *p_wlen
-   (if p_wlen is set). */
-wchar_t*
-_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
+   On memory allocation failure, return -1.
+
+   On decoding error (if surrogateescape is zero), return -2. If wlen is
+   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
+   is not NULL, write the decoding error message into *reason. */
+int
+_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
+                 const char **reason, int surrogateescape)
 {
+    const char *orig_s = s;
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;
@ -5146,18 +4930,12 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
-        if (p_wlen) {
-            *p_wlen = (size_t)-1;
-        }
-        return NULL;
+        return -1;
    }

    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode) {
-        if (p_wlen) {
-            *p_wlen = (size_t)-1;
-        }
-        return NULL;
+        return -1;
    }

    /* Unpack UTF-8 encoded data */
@ -5175,7 +4953,7 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
            Py_UNREACHABLE();
 #else
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
-            /*  compute and append the two surrogates: */
+            /* write a surrogate pair */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
 #endif
@ -5183,60 +4961,88 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
        else {
            if (!ch && s == e)
                break;
+            if (!surrogateescape) {
+                PyMem_RawFree(unicode );
+                if (reason != NULL) {
+                    switch (ch) {
+                    case 0:
+                        *reason = "unexpected end of data";
+                        break;
+                    case 1:
+                        *reason = "invalid start byte";
+                        break;
+                    /* 2, 3, 4 */
+                    default:
+                        *reason = "invalid continuation byte";
+                        break;
+                    }
+                }
+                if (wlen != NULL) {
+                    *wlen = s - orig_s;
+                }
+                return -2;
+            }
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
-    if (p_wlen) {
-        *p_wlen = outpos;
+    if (wlen) {
+        *wlen = outpos;
    }
-    return unicode;
+    *wstr = unicode;
+    return 0;
+}
+
+wchar_t*
+_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
+{
+    wchar_t *wstr;
+    int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
+    if (res != 0) {
+        return NULL;
+    }
+    return wstr;
 }


 /* UTF-8 encoder using the surrogateescape error handler .

-   On success, return a pointer to a newly allocated character string (use
-   PyMem_Free() to free the memory).
+   On success, return 0 and write the newly allocated character string (use
+   PyMem_Free() to free the memory) into *str.

-   On encoding failure, return NULL and write the position of the invalid
-   surrogate character into *error_pos (if error_pos is set).
+   On encoding failure, return -2 and write the position of the invalid
+   surrogate character into *error_pos (if error_pos is set) and the decoding
+   error message into *reason (if reason is set).

-   On memory allocation failure, return NULL and write (size_t)-1 into
-   *error_pos (if error_pos is set). */
-char*
-_Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos,
-                               int raw_malloc)
+   On memory allocation failure, return -1. */
+int
+_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
+                 const char **reason, int raw_malloc, int surrogateescape)
 {
    const Py_ssize_t max_char_size = 4;
    Py_ssize_t len = wcslen(text);

    assert(len >= 0);

+    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
+        return -1;
+    }
    char *bytes;
-    if (len <= PY_SSIZE_T_MAX / max_char_size - 1) {
-        if (raw_malloc) {
-            bytes = PyMem_RawMalloc((len + 1) * max_char_size);
-        }
-        else {
-            bytes = PyMem_Malloc((len + 1) * max_char_size);
-        }
+    if (raw_malloc) {
+        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
    }
    else {
-        bytes = NULL;
+        bytes = PyMem_Malloc((len + 1) * max_char_size);
    }
    if (bytes == NULL) {
-        if (error_pos != NULL) {
-            *error_pos = (size_t)-1;
-        }
-        return NULL;
+        return -1;
    }

    char *p = bytes;
    Py_ssize_t i;
-    for (i = 0; i < len;) {
-        Py_UCS4 ch = text[i++];
+    for (i = 0; i < len; i++) {
+        Py_UCS4 ch = text[i];

        if (ch < 0x80) {
            /* Encode ASCII */
@ -5250,11 +5056,20 @@ _Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos,
        }
        else if (Py_UNICODE_IS_SURROGATE(ch)) {
            /* surrogateescape error handler */
-            if (!(0xDC80 <= ch && ch <= 0xDCFF)) {
+            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
                if (error_pos != NULL) {
-                    *error_pos = (size_t)i - 1;
+                    *error_pos = (size_t)i;
                }
-                goto error;
+                if (reason != NULL) {
+                    *reason = "encoding error";
+                }
+                if (raw_malloc) {
+                    PyMem_RawFree(bytes);
+                }
+                else {
+                    PyMem_Free(bytes);
+                }
+                return -2;
            }
            *p++ = (char)(ch & 0xff);
        }
@ -5286,18 +5101,16 @@ _Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos,
        if (error_pos != NULL) {
            *error_pos = (size_t)-1;
        }
-        goto error;
+        if (raw_malloc) {
+            PyMem_RawFree(bytes);
+        }
+        else {
+            PyMem_Free(bytes);
+        }
+        return -1;
    }
-    return bytes2;
-
- error:
-    if (raw_malloc) {
-        PyMem_RawFree(bytes);
-    }
-    else {
-        PyMem_Free(bytes);
-    }
-    return NULL;
+    *str = bytes2;
+    return 0;
 }