mirror of
https://github.com/python/cpython.git
synced 2025-08-22 09:45:06 +00:00
bpo-29240: PEP 540: Add a new UTF-8 Mode (#855)
* Add the -X utf8 command line option, the PYTHONUTF8 environment variable and a new sys.flags.utf8_mode flag. * If the LC_CTYPE locale is "C" at startup, automatically enable the UTF-8 mode. * Add _winapi.GetACP(). encodings._alias_mbcs() now calls _winapi.GetACP() to get the ANSI code page. * locale.getpreferredencoding() now returns 'UTF-8' in the UTF-8 mode. As a side effect, open() now uses the UTF-8 encoding by default in this mode. * Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding in the UTF-8 mode. * Update subprocess._args_from_interpreter_flags() to handle -X utf8. * Skip some tests relying on the current locale if the UTF-8 mode is enabled. * Add test_utf8mode.py. * _Py_DecodeUTF8_surrogateescape() gets a new optional parameter to also return the length (number of wide characters). * pymain_get_global_config() and pymain_set_global_config() now always copy flag values, rather than only copying if the new value is greater than the old value.
This commit is contained in:
parent
c3e070f849
commit
91106cd9ff
27 changed files with 598 additions and 183 deletions
|
@ -20,9 +20,8 @@ extern int winerror_to_errno(int);
|
|||
#include <fcntl.h>
|
||||
#endif /* HAVE_FCNTL_H */
|
||||
|
||||
#if defined(__APPLE__) || defined(__ANDROID__)
|
||||
extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
|
||||
#endif
|
||||
extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size,
|
||||
size_t *p_wlen);
|
||||
|
||||
#ifdef O_CLOEXEC
|
||||
/* Does open() support the O_CLOEXEC flag? Possible values:
|
||||
|
@ -250,40 +249,9 @@ decode_ascii_surrogateescape(const char *arg, size_t *size)
|
|||
}
|
||||
#endif
|
||||
|
||||
|
||||
/* Decode a byte string from the locale encoding with the
|
||||
surrogateescape error handler: undecodable bytes are decoded as characters
|
||||
in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
|
||||
character, escape the bytes using the surrogateescape error handler instead
|
||||
of decoding them.
|
||||
|
||||
Return a pointer to a newly allocated wide character string, use
|
||||
PyMem_RawFree() to free the memory. If size is not NULL, write the number of
|
||||
wide characters excluding the null character into *size
|
||||
|
||||
Return NULL on decoding error or memory allocation error. If *size* is not
|
||||
NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
|
||||
decoding error.
|
||||
|
||||
Decoding errors should never happen, unless there is a bug in the C
|
||||
library.
|
||||
|
||||
Use the Py_EncodeLocale() function to encode the character string back to a
|
||||
byte string. */
|
||||
wchar_t*
|
||||
Py_DecodeLocale(const char* arg, size_t *size)
|
||||
static wchar_t*
|
||||
decode_locale(const char* arg, size_t *size)
|
||||
{
|
||||
#if defined(__APPLE__) || defined(__ANDROID__)
|
||||
wchar_t *wstr;
|
||||
wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
|
||||
if (size != NULL) {
|
||||
if (wstr != NULL)
|
||||
*size = wcslen(wstr);
|
||||
else
|
||||
*size = (size_t)-1;
|
||||
}
|
||||
return wstr;
|
||||
#else
|
||||
wchar_t *res;
|
||||
size_t argsize;
|
||||
size_t count;
|
||||
|
@ -293,19 +261,6 @@ Py_DecodeLocale(const char* arg, size_t *size)
|
|||
mbstate_t mbs;
|
||||
#endif
|
||||
|
||||
#ifndef MS_WINDOWS
|
||||
if (force_ascii == -1)
|
||||
force_ascii = check_force_ascii();
|
||||
|
||||
if (force_ascii) {
|
||||
/* force ASCII encoding to workaround mbstowcs() issue */
|
||||
res = decode_ascii_surrogateescape(arg, size);
|
||||
if (res == NULL)
|
||||
goto oom;
|
||||
return res;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_BROKEN_MBSTOWCS
|
||||
/* Some platforms have a broken implementation of
|
||||
* mbstowcs which does not count the characters that
|
||||
|
@ -402,43 +357,84 @@ Py_DecodeLocale(const char* arg, size_t *size)
|
|||
goto oom;
|
||||
#endif /* HAVE_MBRTOWC */
|
||||
return res;
|
||||
|
||||
oom:
|
||||
if (size != NULL)
|
||||
if (size != NULL) {
|
||||
*size = (size_t)-1;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
/* Decode a byte string from the locale encoding with the
|
||||
surrogateescape error handler: undecodable bytes are decoded as characters
|
||||
in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
|
||||
character, escape the bytes using the surrogateescape error handler instead
|
||||
of decoding them.
|
||||
|
||||
Return a pointer to a newly allocated wide character string, use
|
||||
PyMem_RawFree() to free the memory. If size is not NULL, write the number of
|
||||
wide characters excluding the null character into *size
|
||||
|
||||
Return NULL on decoding error or memory allocation error. If *size* is not
|
||||
NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
|
||||
decoding error.
|
||||
|
||||
Decoding errors should never happen, unless there is a bug in the C
|
||||
library.
|
||||
|
||||
Use the Py_EncodeLocale() function to encode the character string back to a
|
||||
byte string. */
|
||||
wchar_t*
|
||||
Py_DecodeLocale(const char* arg, size_t *size)
|
||||
{
|
||||
#if defined(__APPLE__) || defined(__ANDROID__)
|
||||
return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
|
||||
#else
|
||||
if (Py_UTF8Mode) {
|
||||
return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
|
||||
}
|
||||
|
||||
#ifndef MS_WINDOWS
|
||||
if (force_ascii == -1)
|
||||
force_ascii = check_force_ascii();
|
||||
|
||||
if (force_ascii) {
|
||||
/* force ASCII encoding to workaround mbstowcs() issue */
|
||||
wchar_t *wstr = decode_ascii_surrogateescape(arg, size);
|
||||
if (wstr == NULL) {
|
||||
if (size != NULL) {
|
||||
*size = (size_t)-1;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
return wstr;
|
||||
}
|
||||
#endif
|
||||
|
||||
return decode_locale(arg, size);
|
||||
#endif /* __APPLE__ or __ANDROID__ */
|
||||
}
|
||||
|
||||
/* Encode a wide character string to the locale encoding with the
|
||||
surrogateescape error handler: surrogate characters in the range
|
||||
U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
|
||||
|
||||
Return a pointer to a newly allocated byte string, use PyMem_Free() to free
|
||||
the memory. Return NULL on encoding or memory allocation error.
|
||||
|
||||
If error_pos is not NULL, *error_pos is set to the index of the invalid
|
||||
character on encoding error, or set to (size_t)-1 otherwise.
|
||||
|
||||
Use the Py_DecodeLocale() function to decode the bytes string back to a wide
|
||||
character string. */
|
||||
char*
|
||||
Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
|
||||
static char*
|
||||
_Py_EncodeLocaleUTF8(const wchar_t *text, size_t *error_pos)
|
||||
{
|
||||
#if defined(__APPLE__) || defined(__ANDROID__)
|
||||
Py_ssize_t len;
|
||||
PyObject *unicode, *bytes = NULL;
|
||||
char *cpath;
|
||||
|
||||
unicode = PyUnicode_FromWideChar(text, wcslen(text));
|
||||
if (unicode == NULL)
|
||||
if (unicode == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
|
||||
Py_DECREF(unicode);
|
||||
if (bytes == NULL) {
|
||||
PyErr_Clear();
|
||||
if (error_pos != NULL)
|
||||
if (error_pos != NULL) {
|
||||
*error_pos = (size_t)-1;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -447,27 +443,24 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
|
|||
if (cpath == NULL) {
|
||||
PyErr_Clear();
|
||||
Py_DECREF(bytes);
|
||||
if (error_pos != NULL)
|
||||
if (error_pos != NULL) {
|
||||
*error_pos = (size_t)-1;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
memcpy(cpath, PyBytes_AsString(bytes), len + 1);
|
||||
Py_DECREF(bytes);
|
||||
return cpath;
|
||||
#else /* __APPLE__ */
|
||||
}
|
||||
|
||||
static char*
|
||||
encode_locale(const wchar_t *text, size_t *error_pos)
|
||||
{
|
||||
const size_t len = wcslen(text);
|
||||
char *result = NULL, *bytes = NULL;
|
||||
size_t i, size, converted;
|
||||
wchar_t c, buf[2];
|
||||
|
||||
#ifndef MS_WINDOWS
|
||||
if (force_ascii == -1)
|
||||
force_ascii = check_force_ascii();
|
||||
|
||||
if (force_ascii)
|
||||
return encode_ascii_surrogateescape(text, error_pos);
|
||||
#endif
|
||||
|
||||
/* The function works in two steps:
|
||||
1. compute the length of the output buffer in bytes (size)
|
||||
2. outputs the bytes */
|
||||
|
@ -522,6 +515,39 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
|
|||
bytes = result;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Encode a wide character string to the locale encoding with the
|
||||
surrogateescape error handler: surrogate characters in the range
|
||||
U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
|
||||
|
||||
Return a pointer to a newly allocated byte string, use PyMem_Free() to free
|
||||
the memory. Return NULL on encoding or memory allocation error.
|
||||
|
||||
If error_pos is not NULL, *error_pos is set to (size_t)-1 on success, or set
|
||||
to the index of the invalid character on encoding error.
|
||||
|
||||
Use the Py_DecodeLocale() function to decode the bytes string back to a wide
|
||||
character string. */
|
||||
char*
|
||||
Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
|
||||
{
|
||||
#if defined(__APPLE__) || defined(__ANDROID__)
|
||||
return _Py_EncodeLocaleUTF8(text, error_pos);
|
||||
#else /* __APPLE__ */
|
||||
if (Py_UTF8Mode) {
|
||||
return _Py_EncodeLocaleUTF8(text, error_pos);
|
||||
}
|
||||
|
||||
#ifndef MS_WINDOWS
|
||||
if (force_ascii == -1)
|
||||
force_ascii = check_force_ascii();
|
||||
|
||||
if (force_ascii)
|
||||
return encode_ascii_surrogateescape(text, error_pos);
|
||||
#endif
|
||||
|
||||
return encode_locale(text, error_pos);
|
||||
#endif /* __APPLE__ or __ANDROID__ */
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue