mirror of
https://github.com/python/cpython.git
synced 2025-08-29 05:05:03 +00:00
bpo-29240: PEP 540: Add a new UTF-8 Mode (#855)
* Add -X utf8 command line option, PYTHONUTF8 environment variable and a new sys.flags.utf8_mode flag. * If the LC_CTYPE locale is "C" at startup: automatically enable the UTF-8 Mode. * Add _winapi.GetACP(). encodings._alias_mbcs() now calls _winapi.GetACP() to get the ANSI code page. * locale.getpreferredencoding() now returns 'UTF-8' in the UTF-8 Mode. As a side effect, open() now uses the UTF-8 encoding by default in this mode. * Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding in the UTF-8 Mode. * Update subprocess._args_from_interpreter_flags() to handle -X utf8. * Skip some tests relying on the current locale if the UTF-8 Mode is enabled. * Add test_utf8mode.py. * _Py_DecodeUTF8_surrogateescape() gets a new optional parameter to also return the length (number of wide characters). * pymain_get_global_config() and pymain_set_global_config() now always copy flag values, rather than only copying if the new value is greater than the old value.
This commit is contained in:
parent
c3e070f849
commit
91106cd9ff
27 changed files with 598 additions and 183 deletions
|
@ -5079,16 +5079,17 @@ onError:
|
|||
return NULL;
|
||||
}
|
||||
|
||||
#if defined(__APPLE__) || defined(__ANDROID__)
|
||||
|
||||
/* Simplified UTF-8 decoder using surrogateescape error handler,
|
||||
used to decode the command line arguments on Mac OS X and Android.
|
||||
/* UTF-8 decoder using the surrogateescape error handler.
|
||||
|
||||
Return a pointer to a newly allocated wide character string (use
|
||||
PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
|
||||
On success, return a pointer to a newly allocated wide character string (use
|
||||
PyMem_RawFree() to free the memory) and write the output length (in number
|
||||
of wchar_t units) into *p_wlen (if p_wlen is set).
|
||||
|
||||
On memory allocation failure, return NULL and write (size_t)-1 into *p_wlen
|
||||
(if p_wlen is set). */
|
||||
wchar_t*
|
||||
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
|
||||
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
|
||||
{
|
||||
const char *e;
|
||||
wchar_t *unicode;
|
||||
|
@ -5096,11 +5097,20 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
|
|||
|
||||
/* Note: size will always be longer than the resulting Unicode
|
||||
character count */
|
||||
if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
|
||||
if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
|
||||
if (p_wlen) {
|
||||
*p_wlen = (size_t)-1;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
|
||||
if (!unicode)
|
||||
if (!unicode) {
|
||||
if (p_wlen) {
|
||||
*p_wlen = (size_t)-1;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Unpack UTF-8 encoded data */
|
||||
e = s + size;
|
||||
|
@ -5130,10 +5140,12 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
|
|||
}
|
||||
}
|
||||
unicode[outpos] = L'\0';
|
||||
if (p_wlen) {
|
||||
*p_wlen = outpos;
|
||||
}
|
||||
return unicode;
|
||||
}
|
||||
|
||||
#endif /* __APPLE__ or __ANDROID__ */
|
||||
|
||||
/* Primary internal function which creates utf8 encoded bytes objects.
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue