bpo-29240: PEP 540: Add a new UTF-8 Mode (#855)

* Add -X utf8 command line option, PYTHONUTF8 environment variable
  and a new sys.flags.utf8_mode flag.
* If the LC_CTYPE locale is "C" at startup, automatically enable the
  UTF-8 Mode.
* Add _winapi.GetACP(). encodings._alias_mbcs() now calls
  _winapi.GetACP() to get the ANSI code page.
* locale.getpreferredencoding() now returns 'UTF-8' in the UTF-8
  mode. As a side effect, open() now uses the UTF-8 encoding by
  default in this mode.
* Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding
  in the UTF-8 Mode.
* Update subprocess._args_from_interpreter_flags() to handle -X utf8
* Skip some tests relying on the current locale if the UTF-8 mode is
  enabled.
* Add test_utf8mode.py.
* _Py_DecodeUTF8_surrogateescape() gets a new optional parameter to
  also return the length (number of wide characters).
* pymain_get_global_config() and pymain_set_global_config() now
  always copy flag values, rather than only copying if the new value
  is greater than the old value.
This commit is contained in:
Victor Stinner 2017-12-13 12:29:09 +01:00 committed by GitHub
parent c3e070f849
commit 91106cd9ff
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
27 changed files with 598 additions and 183 deletions

View file

@ -20,9 +20,8 @@ extern int winerror_to_errno(int);
#include <fcntl.h>
#endif /* HAVE_FCNTL_H */
#if defined(__APPLE__) || defined(__ANDROID__)
extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
#endif
extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size,
size_t *p_wlen);
#ifdef O_CLOEXEC
/* Does open() support the O_CLOEXEC flag? Possible values:
@ -250,40 +249,9 @@ decode_ascii_surrogateescape(const char *arg, size_t *size)
}
#endif
/* Decode a byte string from the locale encoding with the
surrogateescape error handler: undecodable bytes are decoded as characters
in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
character, escape the bytes using the surrogateescape error handler instead
of decoding them.
Return a pointer to a newly allocated wide character string, use
PyMem_RawFree() to free the memory. If size is not NULL, write the number of
wide characters excluding the null character into *size
Return NULL on decoding error or memory allocation error. If *size* is not
NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
decoding error.
Decoding errors should never happen, unless there is a bug in the C
library.
Use the Py_EncodeLocale() function to encode the character string back to a
byte string. */
wchar_t*
Py_DecodeLocale(const char* arg, size_t *size)
static wchar_t*
decode_locale(const char* arg, size_t *size)
{
#if defined(__APPLE__) || defined(__ANDROID__)
wchar_t *wstr;
wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
if (size != NULL) {
if (wstr != NULL)
*size = wcslen(wstr);
else
*size = (size_t)-1;
}
return wstr;
#else
wchar_t *res;
size_t argsize;
size_t count;
@ -293,19 +261,6 @@ Py_DecodeLocale(const char* arg, size_t *size)
mbstate_t mbs;
#endif
#ifndef MS_WINDOWS
if (force_ascii == -1)
force_ascii = check_force_ascii();
if (force_ascii) {
/* force ASCII encoding to workaround mbstowcs() issue */
res = decode_ascii_surrogateescape(arg, size);
if (res == NULL)
goto oom;
return res;
}
#endif
#ifdef HAVE_BROKEN_MBSTOWCS
/* Some platforms have a broken implementation of
* mbstowcs which does not count the characters that
@ -402,43 +357,84 @@ Py_DecodeLocale(const char* arg, size_t *size)
goto oom;
#endif /* HAVE_MBRTOWC */
return res;
oom:
if (size != NULL)
if (size != NULL) {
*size = (size_t)-1;
}
return NULL;
}
/* Decode the byte string *arg* into a wide character string using the
   surrogateescape error handler: bytes that cannot be decoded become
   characters in the range U+DC80..U+DCFF. A byte sequence that would decode
   to a surrogate character is escaped through the same error handler rather
   than being decoded.

   Encoding selection:

   - UTF-8 (_Py_DecodeUTF8_surrogateescape()) when __APPLE__ or __ANDROID__
     is defined, or when Py_UTF8Mode is set;
   - otherwise, except on Windows, ASCII (decode_ascii_surrogateescape())
     when check_force_ascii() requests it; its answer is cached in
     force_ascii;
   - otherwise the locale encoding (decode_locale()).

   On success, return a newly allocated wide character string; release it
   with PyMem_RawFree(). If *size* is not NULL, the number of wide characters,
   not counting the terminating null character, is stored in *size.

   On failure, return NULL. If *size* is not NULL, *size is set to (size_t)-1
   on memory error or to (size_t)-2 on decoding error. Decoding errors should
   never happen, unless there is a bug in the C library.

   Py_EncodeLocale() performs the reverse conversion, from the character
   string back to a byte string. */
wchar_t*
Py_DecodeLocale(const char* arg, size_t *size)
{
#if !defined(__APPLE__) && !defined(__ANDROID__)
    if (!Py_UTF8Mode) {
#ifndef MS_WINDOWS
        /* Compute the "force ASCII" decision only once. */
        if (force_ascii == -1) {
            force_ascii = check_force_ascii();
        }

        if (force_ascii) {
            /* force ASCII encoding to workaround mbstowcs() issue */
            wchar_t *decoded = decode_ascii_surrogateescape(arg, size);
            if (decoded != NULL) {
                return decoded;
            }
            /* Report the failure as a memory error. */
            if (size != NULL) {
                *size = (size_t)-1;
            }
            return NULL;
        }
#endif
        return decode_locale(arg, size);
    }
#endif   /* neither __APPLE__ nor __ANDROID__ */

    /* Apple, Android, or UTF-8 Mode: always decode from UTF-8. */
    return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
}
/* Encode a wide character string to the locale encoding with the
surrogateescape error handler: surrogate characters in the range
U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
Return a pointer to a newly allocated byte string, use PyMem_Free() to free
the memory. Return NULL on encoding or memory allocation error.
If error_pos is not NULL, *error_pos is set to the index of the invalid
character on encoding error, or set to (size_t)-1 otherwise.
Use the Py_DecodeLocale() function to decode the bytes string back to a wide
character string. */
char*
Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
static char*
_Py_EncodeLocaleUTF8(const wchar_t *text, size_t *error_pos)
{
#if defined(__APPLE__) || defined(__ANDROID__)
Py_ssize_t len;
PyObject *unicode, *bytes = NULL;
char *cpath;
unicode = PyUnicode_FromWideChar(text, wcslen(text));
if (unicode == NULL)
if (unicode == NULL) {
return NULL;
}
bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Py_DECREF(unicode);
if (bytes == NULL) {
PyErr_Clear();
if (error_pos != NULL)
if (error_pos != NULL) {
*error_pos = (size_t)-1;
}
return NULL;
}
@ -447,27 +443,24 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
if (cpath == NULL) {
PyErr_Clear();
Py_DECREF(bytes);
if (error_pos != NULL)
if (error_pos != NULL) {
*error_pos = (size_t)-1;
}
return NULL;
}
memcpy(cpath, PyBytes_AsString(bytes), len + 1);
Py_DECREF(bytes);
return cpath;
#else /* __APPLE__ */
}
static char*
encode_locale(const wchar_t *text, size_t *error_pos)
{
const size_t len = wcslen(text);
char *result = NULL, *bytes = NULL;
size_t i, size, converted;
wchar_t c, buf[2];
#ifndef MS_WINDOWS
if (force_ascii == -1)
force_ascii = check_force_ascii();
if (force_ascii)
return encode_ascii_surrogateescape(text, error_pos);
#endif
/* The function works in two steps:
1. compute the length of the output buffer in bytes (size)
2. outputs the bytes */
@ -522,6 +515,39 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
bytes = result;
}
return result;
}
/* Encode the wide character string *text* into a byte string using the
   surrogateescape error handler: surrogate characters in the range
   U+DC80..U+DCFF are converted back to bytes 0x80..0xFF.

   Encoding selection:

   - UTF-8 (_Py_EncodeLocaleUTF8()) when __APPLE__ or __ANDROID__ is
     defined, or when Py_UTF8Mode is set;
   - otherwise, except on Windows, ASCII (encode_ascii_surrogateescape())
     when check_force_ascii() requests it; its answer is cached in
     force_ascii;
   - otherwise the locale encoding (encode_locale()).

   On success, return a newly allocated byte string; release it with
   PyMem_Free(). Return NULL on encoding error or memory allocation error.

   If error_pos is not NULL, *error_pos is set to (size_t)-1 on success, or
   to the index of the invalid character on encoding error.

   Py_DecodeLocale() performs the reverse conversion, from the byte string
   back to a wide character string. */
char*
Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
{
#if !defined(__APPLE__) && !defined(__ANDROID__)
    if (!Py_UTF8Mode) {
#ifndef MS_WINDOWS
        /* Compute the "force ASCII" decision only once. */
        if (force_ascii == -1) {
            force_ascii = check_force_ascii();
        }

        if (force_ascii) {
            return encode_ascii_surrogateescape(text, error_pos);
        }
#endif
        return encode_locale(text, error_pos);
    }
#endif   /* neither __APPLE__ nor __ANDROID__ */

    /* Apple, Android, or UTF-8 Mode: always encode to UTF-8. */
    return _Py_EncodeLocaleUTF8(text, error_pos);
}