bpo-29240: PEP 540: Add a new UTF-8 Mode (#855)

* Add -X utf8 command line option, PYTHONUTF8 environment variable
  and a new sys.flags.utf8_mode flag.
* If the LC_CTYPE locale is "C" at startup: enable automatically the
  UTF-8 mode.
* Add _winapi.GetACP(). encodings._alias_mbcs() now calls
  _winapi.GetACP() to get the ANSI code page
* locale.getpreferredencoding() now returns 'UTF-8' in the UTF-8
  mode. As a side effect, open() now uses the UTF-8 encoding by
  default in this mode.
* Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding
  in the UTF-8 Mode.
* Update subprocess._args_from_interpreter_flags() to handle -X utf8
* Skip some tests relying on the current locale if the UTF-8 mode is
  enabled.
* Add test_utf8mode.py.
* _Py_DecodeUTF8_surrogateescape() gets a new optional parameter to
  return also the length (number of wide characters).
* pymain_get_global_config() and pymain_set_global_config() now
  always copy flag values, rather than only copying if the new value
  is greater than the old value.
This commit is contained in:
Victor Stinner 2017-12-13 12:29:09 +01:00 committed by GitHub
parent c3e070f849
commit 91106cd9ff
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
27 changed files with 598 additions and 183 deletions

View file

@ -29,6 +29,9 @@ const char *Py_FileSystemDefaultEncoding = NULL; /* set by initfsencoding() */
int Py_HasFileSystemDefaultEncoding = 0;
#endif
const char *Py_FileSystemDefaultEncodeErrors = "surrogateescape";
/* UTF-8 mode (PEP 540): if non-zero, use the UTF-8 encoding, and change stdin
and stdout error handler to "surrogateescape". */
int Py_UTF8Mode = 0;
_Py_IDENTIFIER(__builtins__);
_Py_IDENTIFIER(__dict__);

View file

@ -20,9 +20,8 @@ extern int winerror_to_errno(int);
#include <fcntl.h>
#endif /* HAVE_FCNTL_H */
#if defined(__APPLE__) || defined(__ANDROID__)
extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
#endif
extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size,
size_t *p_wlen);
#ifdef O_CLOEXEC
/* Does open() support the O_CLOEXEC flag? Possible values:
@ -250,40 +249,9 @@ decode_ascii_surrogateescape(const char *arg, size_t *size)
}
#endif
/* Decode a byte string from the locale encoding with the
surrogateescape error handler: undecodable bytes are decoded as characters
in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
character, escape the bytes using the surrogateescape error handler instead
of decoding them.
Return a pointer to a newly allocated wide character string, use
PyMem_RawFree() to free the memory. If size is not NULL, write the number of
wide characters excluding the null character into *size
Return NULL on decoding error or memory allocation error. If *size* is not
NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
decoding error.
Decoding errors should never happen, unless there is a bug in the C
library.
Use the Py_EncodeLocale() function to encode the character string back to a
byte string. */
wchar_t*
Py_DecodeLocale(const char* arg, size_t *size)
static wchar_t*
decode_locale(const char* arg, size_t *size)
{
#if defined(__APPLE__) || defined(__ANDROID__)
wchar_t *wstr;
wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
if (size != NULL) {
if (wstr != NULL)
*size = wcslen(wstr);
else
*size = (size_t)-1;
}
return wstr;
#else
wchar_t *res;
size_t argsize;
size_t count;
@ -293,19 +261,6 @@ Py_DecodeLocale(const char* arg, size_t *size)
mbstate_t mbs;
#endif
#ifndef MS_WINDOWS
if (force_ascii == -1)
force_ascii = check_force_ascii();
if (force_ascii) {
/* force ASCII encoding to workaround mbstowcs() issue */
res = decode_ascii_surrogateescape(arg, size);
if (res == NULL)
goto oom;
return res;
}
#endif
#ifdef HAVE_BROKEN_MBSTOWCS
/* Some platforms have a broken implementation of
* mbstowcs which does not count the characters that
@ -402,43 +357,84 @@ Py_DecodeLocale(const char* arg, size_t *size)
goto oom;
#endif /* HAVE_MBRTOWC */
return res;
oom:
if (size != NULL)
if (size != NULL) {
*size = (size_t)-1;
}
return NULL;
}
/* Decode a byte string from the locale encoding with the
surrogateescape error handler: undecodable bytes are decoded as characters
in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
character, escape the bytes using the surrogateescape error handler instead
of decoding them.
Return a pointer to a newly allocated wide character string, use
PyMem_RawFree() to free the memory. If size is not NULL, write the number of
wide characters excluding the null character into *size
Return NULL on decoding error or memory allocation error. If *size* is not
NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
decoding error.
Decoding errors should never happen, unless there is a bug in the C
library.
Use the Py_EncodeLocale() function to encode the character string back to a
byte string. */
wchar_t*
Py_DecodeLocale(const char* arg, size_t *size)
{
#if defined(__APPLE__) || defined(__ANDROID__)
return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
#else
if (Py_UTF8Mode) {
return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
}
#ifndef MS_WINDOWS
if (force_ascii == -1)
force_ascii = check_force_ascii();
if (force_ascii) {
/* force ASCII encoding to workaround mbstowcs() issue */
wchar_t *wstr = decode_ascii_surrogateescape(arg, size);
if (wstr == NULL) {
if (size != NULL) {
*size = (size_t)-1;
}
return NULL;
}
return wstr;
}
#endif
return decode_locale(arg, size);
#endif /* __APPLE__ or __ANDROID__ */
}
/* Encode a wide character string to the locale encoding with the
surrogateescape error handler: surrogate characters in the range
U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
Return a pointer to a newly allocated byte string, use PyMem_Free() to free
the memory. Return NULL on encoding or memory allocation error.
If error_pos is not NULL, *error_pos is set to the index of the invalid
character on encoding error, or set to (size_t)-1 otherwise.
Use the Py_DecodeLocale() function to decode the bytes string back to a wide
character string. */
char*
Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
static char*
_Py_EncodeLocaleUTF8(const wchar_t *text, size_t *error_pos)
{
#if defined(__APPLE__) || defined(__ANDROID__)
Py_ssize_t len;
PyObject *unicode, *bytes = NULL;
char *cpath;
unicode = PyUnicode_FromWideChar(text, wcslen(text));
if (unicode == NULL)
if (unicode == NULL) {
return NULL;
}
bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Py_DECREF(unicode);
if (bytes == NULL) {
PyErr_Clear();
if (error_pos != NULL)
if (error_pos != NULL) {
*error_pos = (size_t)-1;
}
return NULL;
}
@ -447,27 +443,24 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
if (cpath == NULL) {
PyErr_Clear();
Py_DECREF(bytes);
if (error_pos != NULL)
if (error_pos != NULL) {
*error_pos = (size_t)-1;
}
return NULL;
}
memcpy(cpath, PyBytes_AsString(bytes), len + 1);
Py_DECREF(bytes);
return cpath;
#else /* __APPLE__ */
}
static char*
encode_locale(const wchar_t *text, size_t *error_pos)
{
const size_t len = wcslen(text);
char *result = NULL, *bytes = NULL;
size_t i, size, converted;
wchar_t c, buf[2];
#ifndef MS_WINDOWS
if (force_ascii == -1)
force_ascii = check_force_ascii();
if (force_ascii)
return encode_ascii_surrogateescape(text, error_pos);
#endif
/* The function works in two steps:
1. compute the length of the output buffer in bytes (size)
2. outputs the bytes */
@ -522,6 +515,39 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
bytes = result;
}
return result;
}
/* Encode a wide character string to the locale encoding with the
surrogateescape error handler: surrogate characters in the range
U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
Return a pointer to a newly allocated byte string, use PyMem_Free() to free
the memory. Return NULL on encoding or memory allocation error.
If error_pos is not NULL, *error_pos is set to (size_t)-1 on success, or set
to the index of the invalid character on encoding error.
Use the Py_DecodeLocale() function to decode the bytes string back to a wide
character string. */
char*
Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
{
#if defined(__APPLE__) || defined(__ANDROID__)
return _Py_EncodeLocaleUTF8(text, error_pos);
#else /* __APPLE__ */
if (Py_UTF8Mode) {
return _Py_EncodeLocaleUTF8(text, error_pos);
}
#ifndef MS_WINDOWS
if (force_ascii == -1)
force_ascii = check_force_ascii();
if (force_ascii)
return encode_ascii_surrogateescape(text, error_pos);
#endif
return encode_locale(text, error_pos);
#endif /* __APPLE__ or __ANDROID__ */
}

View file

@ -54,7 +54,7 @@ extern grammar _PyParser_Grammar; /* From graminit.c */
static _PyInitError add_main_module(PyInterpreterState *interp);
static _PyInitError initfsencoding(PyInterpreterState *interp);
static _PyInitError initsite(void);
static _PyInitError init_sys_streams(void);
static _PyInitError init_sys_streams(PyInterpreterState *interp);
static _PyInitError initsigs(void);
static void call_py_exitfuncs(void);
static void wait_for_thread_shutdown(void);
@ -925,7 +925,7 @@ _Py_InitializeMainInterpreter(const _PyMainInterpreterConfig *config)
return err;
}
err = init_sys_streams();
err = init_sys_streams(interp);
if (_Py_INIT_FAILED(err)) {
return err;
}
@ -1410,7 +1410,7 @@ new_interpreter(PyThreadState **tstate_p)
return err;
}
err = init_sys_streams();
err = init_sys_streams(interp);
if (_Py_INIT_FAILED(err)) {
return err;
}
@ -1558,7 +1558,13 @@ initfsencoding(PyInterpreterState *interp)
Py_FileSystemDefaultEncodeErrors = "surrogatepass";
}
#else
if (Py_FileSystemDefaultEncoding == NULL) {
if (Py_FileSystemDefaultEncoding == NULL &&
interp->core_config.utf8_mode)
{
Py_FileSystemDefaultEncoding = "utf-8";
Py_HasFileSystemDefaultEncoding = 1;
}
else if (Py_FileSystemDefaultEncoding == NULL) {
Py_FileSystemDefaultEncoding = get_locale_encoding();
if (Py_FileSystemDefaultEncoding == NULL) {
return _Py_INIT_ERR("Unable to get the locale encoding");
@ -1749,7 +1755,7 @@ error:
/* Initialize sys.stdin, stdout, stderr and builtins.open */
static _PyInitError
init_sys_streams(void)
init_sys_streams(PyInterpreterState *interp)
{
PyObject *iomod = NULL, *wrapper;
PyObject *bimod = NULL;
@ -1794,10 +1800,10 @@ init_sys_streams(void)
encoding = _Py_StandardStreamEncoding;
errors = _Py_StandardStreamErrors;
if (!encoding || !errors) {
pythonioencoding = Py_GETENV("PYTHONIOENCODING");
if (pythonioencoding) {
char *opt = Py_GETENV("PYTHONIOENCODING");
if (opt && opt[0] != '\0') {
char *err;
pythonioencoding = _PyMem_Strdup(pythonioencoding);
pythonioencoding = _PyMem_Strdup(opt);
if (pythonioencoding == NULL) {
PyErr_NoMemory();
goto error;
@ -1814,7 +1820,12 @@ init_sys_streams(void)
encoding = pythonioencoding;
}
}
if (!errors && !(pythonioencoding && *pythonioencoding)) {
else if (interp->core_config.utf8_mode) {
encoding = "utf-8";
errors = "surrogateescape";
}
if (!errors && !pythonioencoding) {
/* Choose the default error handler based on the current locale */
errors = get_default_standard_stream_error_handler();
}

View file

@ -1814,6 +1814,7 @@ static PyStructSequence_Field flags_fields[] = {
{"hash_randomization", "-R"},
{"isolated", "-I"},
{"dev_mode", "-X dev"},
{"utf8_mode", "-X utf8"},
{0}
};
@ -1821,7 +1822,7 @@ static PyStructSequence_Desc flags_desc = {
"sys.flags", /* name */
flags__doc__, /* doc */
flags_fields, /* fields */
14
15
};
static PyObject*
@ -1853,8 +1854,9 @@ make_flags(void)
SetFlag(Py_QuietFlag);
SetFlag(Py_HashRandomizationFlag);
SetFlag(Py_IsolatedFlag);
#undef SetFlag
PyStructSequence_SET_ITEM(seq, pos++, PyBool_FromLong(core_config->dev_mode));
SetFlag(Py_UTF8Mode);
#undef SetFlag
if (PyErr_Occurred()) {
Py_DECREF(seq);