bpo-29240: PEP 540: Add a new UTF-8 Mode (#855)

* Add -X utf8 command line option, PYTHONUTF8 environment variable
  and a new sys.flags.utf8_mode flag.
* If the LC_CTYPE locale is "C" at startup: automatically enable the
  UTF-8 mode.
* Add _winapi.GetACP(). encodings._alias_mbcs() now calls
  _winapi.GetACP() to get the ANSI code page
* locale.getpreferredencoding() now returns 'UTF-8' in the UTF-8
  mode. As a side effect, open() now uses the UTF-8 encoding by
  default in this mode.
* Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding
  in the UTF-8 Mode.
* Update subprocess._args_from_interpreter_flags() to handle -X utf8
* Skip some tests relying on the current locale if the UTF-8 mode is
  enabled.
* Add test_utf8mode.py.
* _Py_DecodeUTF8_surrogateescape() gets a new optional parameter to
  also return the length (number of wide characters).
* pymain_get_global_config() and pymain_set_global_config() now
  always copy flag values, rather than only copying if the new value
  is greater than the old value.
This commit is contained in:
Victor Stinner 2017-12-13 12:29:09 +01:00 committed by GitHub
parent c3e070f849
commit 91106cd9ff
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
27 changed files with 598 additions and 183 deletions

View file

@ -127,6 +127,9 @@ Operating System Utilities
.. versionadded:: 3.5 .. versionadded:: 3.5
.. versionchanged:: 3.7
The function now uses the UTF-8 encoding in the UTF-8 mode.
.. c:function:: char* Py_EncodeLocale(const wchar_t *text, size_t *error_pos) .. c:function:: char* Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
@ -138,12 +141,15 @@ Operating System Utilities
to free the memory. Return ``NULL`` on encoding error or memory allocation to free the memory. Return ``NULL`` on encoding error or memory allocation
error error
If error_pos is not ``NULL``, ``*error_pos`` is set to the index of the If error_pos is not ``NULL``, ``*error_pos`` is set to ``(size_t)-1`` on
invalid character on encoding error, or set to ``(size_t)-1`` otherwise. success, or set to the index of the invalid character on encoding error.
Use the :c:func:`Py_DecodeLocale` function to decode the bytes string back Use the :c:func:`Py_DecodeLocale` function to decode the bytes string back
to a wide character string. to a wide character string.
.. versionchanged:: 3.7
The function now uses the UTF-8 encoding in the UTF-8 mode.
.. seealso:: .. seealso::
The :c:func:`PyUnicode_EncodeFSDefault` and The :c:func:`PyUnicode_EncodeFSDefault` and
@ -151,6 +157,9 @@ Operating System Utilities
.. versionadded:: 3.5 .. versionadded:: 3.5
.. versionchanged:: 3.7
The function now supports the UTF-8 mode.
.. _systemfunctions: .. _systemfunctions:

View file

@ -316,6 +316,13 @@ The :mod:`locale` module defines the following exception and functions:
preferences, so this function is not thread-safe. If invoking setlocale is not preferences, so this function is not thread-safe. If invoking setlocale is not
necessary or desired, *do_setlocale* should be set to ``False``. necessary or desired, *do_setlocale* should be set to ``False``.
On Android or in the UTF-8 mode (:option:`-X` ``utf8`` option), always
return ``'UTF-8'``; the locale and the *do_setlocale* argument are ignored.
.. versionchanged:: 3.7
The function now always returns ``'UTF-8'`` on Android or if the UTF-8 mode
is enabled.
.. function:: normalize(localename) .. function:: normalize(localename)

View file

@ -313,6 +313,9 @@ always available.
has caught :exc:`SystemExit` (such as an error flushing buffered data has caught :exc:`SystemExit` (such as an error flushing buffered data
in the standard streams), the exit status is changed to 120. in the standard streams), the exit status is changed to 120.
.. versionchanged:: 3.7
Added ``utf8_mode`` attribute for the new :option:`-X` ``utf8`` flag.
.. data:: flags .. data:: flags
@ -335,6 +338,7 @@ always available.
:const:`quiet` :option:`-q` :const:`quiet` :option:`-q`
:const:`hash_randomization` :option:`-R` :const:`hash_randomization` :option:`-R`
:const:`dev_mode` :option:`-X` ``dev`` :const:`dev_mode` :option:`-X` ``dev``
:const:`utf8_mode` :option:`-X` ``utf8``
============================= ============================= ============================= =============================
.. versionchanged:: 3.2 .. versionchanged:: 3.2
@ -347,7 +351,8 @@ always available.
Removed obsolete ``division_warning`` attribute. Removed obsolete ``division_warning`` attribute.
.. versionchanged:: 3.7 .. versionchanged:: 3.7
Added ``dev_mode`` attribute for the new :option:`-X` ``dev`` flag. Added ``dev_mode`` attribute for the new :option:`-X` ``dev`` flag
and ``utf8_mode`` attribute for the new :option:`-X` ``utf8`` flag.
.. data:: float_info .. data:: float_info
@ -492,6 +497,8 @@ always available.
:func:`os.fsencode` and :func:`os.fsdecode` should be used to ensure that :func:`os.fsencode` and :func:`os.fsdecode` should be used to ensure that
the correct encoding and errors mode are used. the correct encoding and errors mode are used.
* In the UTF-8 mode, the encoding is ``'utf-8'`` on any platform.
* On Mac OS X, the encoding is ``'utf-8'``. * On Mac OS X, the encoding is ``'utf-8'``.
* On Unix, the encoding is the locale encoding. * On Unix, the encoding is the locale encoding.
@ -506,6 +513,10 @@ always available.
Windows is no longer guaranteed to return ``'mbcs'``. See :pep:`529` Windows is no longer guaranteed to return ``'mbcs'``. See :pep:`529`
and :func:`_enablelegacywindowsfsencoding` for more information. and :func:`_enablelegacywindowsfsencoding` for more information.
.. versionchanged:: 3.7
Return ``'utf-8'`` in the UTF-8 mode.
.. function:: getfilesystemencodeerrors() .. function:: getfilesystemencodeerrors()
Return the name of the error mode used to convert between Unicode filenames Return the name of the error mode used to convert between Unicode filenames

View file

@ -439,6 +439,9 @@ Miscellaneous options
* Set the :attr:`~sys.flags.dev_mode` attribute of :attr:`sys.flags` to * Set the :attr:`~sys.flags.dev_mode` attribute of :attr:`sys.flags` to
``True`` ``True``
* ``-X utf8`` enables the UTF-8 mode, whereas ``-X utf8=0`` disables the
UTF-8 mode.
It also allows passing arbitrary values and retrieving them through the It also allows passing arbitrary values and retrieving them through the
:data:`sys._xoptions` dictionary. :data:`sys._xoptions` dictionary.
@ -455,7 +458,7 @@ Miscellaneous options
The ``-X showalloccount`` option. The ``-X showalloccount`` option.
.. versionadded:: 3.7 .. versionadded:: 3.7
The ``-X importtime`` and ``-X dev`` options. The ``-X importtime``, ``-X dev`` and ``-X utf8`` options.
Options you shouldn't use Options you shouldn't use
@ -816,6 +819,14 @@ conflict.
.. versionadded:: 3.7 .. versionadded:: 3.7
.. envvar:: PYTHONUTF8
If set to ``1``, enable the UTF-8 mode. If set to ``0``, disable the UTF-8
mode. Any other non-empty string causes an error.
.. versionadded:: 3.7
Debug-mode variables Debug-mode variables
~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~

View file

@ -185,6 +185,23 @@ resolution on Linux and Windows.
PEP written and implemented by Victor Stinner PEP written and implemented by Victor Stinner
PEP 540: Add a new UTF-8 mode
-----------------------------
Add a new UTF-8 mode to ignore the locale, use the UTF-8 encoding, and change
:data:`sys.stdin` and :data:`sys.stdout` error handlers to ``surrogateescape``.
This mode is enabled by default in the POSIX locale, but otherwise disabled by
default.
The new :option:`-X` ``utf8`` command line option and :envvar:`PYTHONUTF8`
environment variable are added to control the UTF-8 mode.
.. seealso::
:pep:`540` -- Add a new UTF-8 mode
PEP written and implemented by Victor Stinner
New Development Mode: -X dev New Development Mode: -X dev
---------------------------- ----------------------------
@ -353,6 +370,10 @@ Added another argument *monetary* in :meth:`format_string` of :mod:`locale`.
If *monetary* is true, the conversion uses monetary thousands separator and If *monetary* is true, the conversion uses monetary thousands separator and
grouping strings. (Contributed by Garvit in :issue:`10379`.) grouping strings. (Contributed by Garvit in :issue:`10379`.)
The :func:`locale.getpreferredencoding` function now always returns ``'UTF-8'``
on Android or in the UTF-8 mode (:option:`-X` ``utf8`` option); the locale and
the *do_setlocale* argument are ignored.
math math
---- ----

View file

@ -28,6 +28,10 @@ PyAPI_DATA(const char *) Py_FileSystemDefaultEncodeErrors;
#endif #endif
PyAPI_DATA(int) Py_HasFileSystemDefaultEncoding; PyAPI_DATA(int) Py_HasFileSystemDefaultEncoding;
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03070000
PyAPI_DATA(int) Py_UTF8Mode;
#endif
/* Internal API /* Internal API
The std printer acts as a preliminary sys.stderr until the new io The std printer acts as a preliminary sys.stderr until the new io

View file

@ -38,6 +38,7 @@ typedef struct {
int show_alloc_count; /* -X showalloccount */ int show_alloc_count; /* -X showalloccount */
int dump_refs; /* PYTHONDUMPREFS */ int dump_refs; /* PYTHONDUMPREFS */
int malloc_stats; /* PYTHONMALLOCSTATS */ int malloc_stats; /* PYTHONMALLOCSTATS */
int utf8_mode; /* -X utf8 or PYTHONUTF8 environment variable */
} _PyCoreConfig; } _PyCoreConfig;
#define _PyCoreConfig_INIT (_PyCoreConfig){.use_hash_seed = -1} #define _PyCoreConfig_INIT (_PyCoreConfig){.use_hash_seed = -1}

View file

@ -9,6 +9,8 @@ import _locale
if sys.platform.startswith("win"): if sys.platform.startswith("win"):
def getpreferredencoding(do_setlocale=True): def getpreferredencoding(do_setlocale=True):
if sys.flags.utf8_mode:
return 'UTF-8'
return _locale._getdefaultlocale()[1] return _locale._getdefaultlocale()[1]
else: else:
try: try:
@ -21,6 +23,8 @@ else:
return 'UTF-8' return 'UTF-8'
else: else:
def getpreferredencoding(do_setlocale=True): def getpreferredencoding(do_setlocale=True):
if sys.flags.utf8_mode:
return 'UTF-8'
# This path for legacy systems needs the more complex # This path for legacy systems needs the more complex
# getdefaultlocale() function, import the full locale module. # getdefaultlocale() function, import the full locale module.
import locale import locale
@ -28,6 +32,8 @@ else:
else: else:
def getpreferredencoding(do_setlocale=True): def getpreferredencoding(do_setlocale=True):
assert not do_setlocale assert not do_setlocale
if sys.flags.utf8_mode:
return 'UTF-8'
result = _locale.nl_langinfo(_locale.CODESET) result = _locale.nl_langinfo(_locale.CODESET)
if not result and sys.platform == 'darwin': if not result and sys.platform == 'darwin':
# nl_langinfo can return an empty string # nl_langinfo can return an empty string

View file

@ -158,8 +158,9 @@ codecs.register(search_function)
if sys.platform == 'win32': if sys.platform == 'win32':
def _alias_mbcs(encoding): def _alias_mbcs(encoding):
try: try:
import _bootlocale import _winapi
if encoding == _bootlocale.getpreferredencoding(False): ansi_code_page = "cp%s" % _winapi.GetACP()
if encoding == ansi_code_page:
import encodings.mbcs import encodings.mbcs
return encodings.mbcs.getregentry() return encodings.mbcs.getregentry()
except ImportError: except ImportError:

View file

@ -617,6 +617,8 @@ if sys.platform.startswith("win"):
# On Win32, this will return the ANSI code page # On Win32, this will return the ANSI code page
def getpreferredencoding(do_setlocale = True): def getpreferredencoding(do_setlocale = True):
"""Return the charset that the user is likely using.""" """Return the charset that the user is likely using."""
if sys.flags.utf8_mode:
return 'UTF-8'
import _bootlocale import _bootlocale
return _bootlocale.getpreferredencoding(False) return _bootlocale.getpreferredencoding(False)
else: else:
@ -634,6 +636,8 @@ else:
def getpreferredencoding(do_setlocale = True): def getpreferredencoding(do_setlocale = True):
"""Return the charset that the user is likely using, """Return the charset that the user is likely using,
by looking at environment variables.""" by looking at environment variables."""
if sys.flags.utf8_mode:
return 'UTF-8'
res = getdefaultlocale()[1] res = getdefaultlocale()[1]
if res is None: if res is None:
# LANG not set, default conservatively to ASCII # LANG not set, default conservatively to ASCII
@ -643,6 +647,8 @@ else:
def getpreferredencoding(do_setlocale = True): def getpreferredencoding(do_setlocale = True):
"""Return the charset that the user is likely using, """Return the charset that the user is likely using,
according to the system configuration.""" according to the system configuration."""
if sys.flags.utf8_mode:
return 'UTF-8'
import _bootlocale import _bootlocale
if do_setlocale: if do_setlocale:
oldloc = setlocale(LC_CTYPE) oldloc = setlocale(LC_CTYPE)

View file

@ -280,7 +280,7 @@ def _args_from_interpreter_flags():
if dev_mode: if dev_mode:
args.extend(('-X', 'dev')) args.extend(('-X', 'dev'))
for opt in ('faulthandler', 'tracemalloc', 'importtime', for opt in ('faulthandler', 'tracemalloc', 'importtime',
'showalloccount', 'showrefcount'): 'showalloccount', 'showrefcount', 'utf8'):
if opt in xoptions: if opt in xoptions:
value = xoptions[opt] value = xoptions[opt]
if value is True: if value is True:

View file

@ -1022,6 +1022,7 @@ class BuiltinTest(unittest.TestCase):
self.assertRaises(ValueError, open, 'a\x00b') self.assertRaises(ValueError, open, 'a\x00b')
self.assertRaises(ValueError, open, b'a\x00b') self.assertRaises(ValueError, open, b'a\x00b')
@unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled")
def test_open_default_encoding(self): def test_open_default_encoding(self):
old_environ = dict(os.environ) old_environ = dict(os.environ)
try: try:

View file

@ -130,7 +130,7 @@ class EncodingDetails(_EncodingDetails):
that. that.
""" """
result, py_cmd = run_python_until_end( result, py_cmd = run_python_until_end(
"-c", cls.CHILD_PROCESS_SCRIPT, "-X", "utf8=0", "-c", cls.CHILD_PROCESS_SCRIPT,
__isolated=True, __isolated=True,
**env_vars **env_vars
) )

View file

@ -5,6 +5,7 @@ import locale
import sys import sys
import unittest import unittest
import encodings import encodings
from unittest import mock
from test import support from test import support
@ -3180,16 +3181,9 @@ class CodePageTest(unittest.TestCase):
def test_mbcs_alias(self): def test_mbcs_alias(self):
# Check that looking up our 'default' codepage will return # Check that looking up our 'default' codepage will return
# mbcs when we don't have a more specific one available # mbcs when we don't have a more specific one available
import _bootlocale with mock.patch('_winapi.GetACP', return_value=123):
def _get_fake_codepage(*a):
return 'cp123'
old_getpreferredencoding = _bootlocale.getpreferredencoding
_bootlocale.getpreferredencoding = _get_fake_codepage
try:
codec = codecs.lookup('cp123') codec = codecs.lookup('cp123')
self.assertEqual(codec.name, 'mbcs') self.assertEqual(codec.name, 'mbcs')
finally:
_bootlocale.getpreferredencoding = old_getpreferredencoding
class ASCIITest(unittest.TestCase): class ASCIITest(unittest.TestCase):

View file

@ -2580,6 +2580,7 @@ class TextIOWrapperTest(unittest.TestCase):
t.reconfigure(line_buffering=None) t.reconfigure(line_buffering=None)
self.assertEqual(t.line_buffering, True) self.assertEqual(t.line_buffering, True)
@unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled")
def test_default_encoding(self): def test_default_encoding(self):
old_environ = dict(os.environ) old_environ = dict(os.environ)
try: try:
@ -2599,6 +2600,7 @@ class TextIOWrapperTest(unittest.TestCase):
os.environ.update(old_environ) os.environ.update(old_environ)
@support.cpython_only @support.cpython_only
@unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled")
def test_device_encoding(self): def test_device_encoding(self):
# Issue 15989 # Issue 15989
import _testcapi import _testcapi

View file

@ -527,7 +527,7 @@ class SysModuleTest(unittest.TestCase):
"inspect", "interactive", "optimize", "dont_write_bytecode", "inspect", "interactive", "optimize", "dont_write_bytecode",
"no_user_site", "no_site", "ignore_environment", "verbose", "no_user_site", "no_site", "ignore_environment", "verbose",
"bytes_warning", "quiet", "hash_randomization", "isolated", "bytes_warning", "quiet", "hash_randomization", "isolated",
"dev_mode") "dev_mode", "utf8_mode")
for attr in attrs: for attr in attrs:
self.assertTrue(hasattr(sys.flags, attr), attr) self.assertTrue(hasattr(sys.flags, attr), attr)
attr_type = bool if attr == "dev_mode" else int attr_type = bool if attr == "dev_mode" else int
@ -535,6 +535,8 @@ class SysModuleTest(unittest.TestCase):
self.assertTrue(repr(sys.flags)) self.assertTrue(repr(sys.flags))
self.assertEqual(len(sys.flags), len(attrs)) self.assertEqual(len(sys.flags), len(attrs))
self.assertIn(sys.flags.utf8_mode, {0, 1, 2})
def assert_raise_on_new_sys_type(self, sys_attr): def assert_raise_on_new_sys_type(self, sys_attr):
# Users are intentionally prevented from creating new instances of # Users are intentionally prevented from creating new instances of
# sys.flags, sys.version_info, and sys.getwindowsversion. # sys.flags, sys.version_info, and sys.getwindowsversion.
@ -710,8 +712,8 @@ class SysModuleTest(unittest.TestCase):
# have no any effect # have no any effect
out = self.c_locale_get_error_handler(encoding=':') out = self.c_locale_get_error_handler(encoding=':')
self.assertEqual(out, self.assertEqual(out,
'stdin: surrogateescape\n' 'stdin: strict\n'
'stdout: surrogateescape\n' 'stdout: strict\n'
'stderr: backslashreplace\n') 'stderr: backslashreplace\n')
out = self.c_locale_get_error_handler(encoding='') out = self.c_locale_get_error_handler(encoding='')
self.assertEqual(out, self.assertEqual(out,

206
Lib/test/test_utf8_mode.py Normal file
View file

@ -0,0 +1,206 @@
"""
Test the implementation of the PEP 540: the UTF-8 Mode.
"""
import locale
import os
import sys
import textwrap
import unittest
from test.support.script_helper import assert_python_ok, assert_python_failure
MS_WINDOWS = (sys.platform == 'win32')
class UTF8ModeTests(unittest.TestCase):
    # Every test spawns a child interpreter (through get_output()) and checks
    # what it prints, so the UTF-8 mode state of the interpreter running the
    # test suite does not influence the results.

    # Override PYTHONUTF8 and PYTHONLEGACYWINDOWSFSENCODING environment
    # variables by default
    DEFAULT_ENV = {'PYTHONUTF8': '', 'PYTHONLEGACYWINDOWSFSENCODING': ''}

    def posix_locale(self):
        # True if the LC_CTYPE locale of the current process is "C".
        loc = locale.setlocale(locale.LC_CTYPE, None)
        return (loc == 'C')

    def get_output(self, *args, failure=False, **kw):
        # Run a child Python with *args* as command line arguments and *kw*
        # as environment variables (layered over DEFAULT_ENV).
        #
        # Return the decoded stdout of the child when failure is false, or
        # its decoded stderr when failure is true (the child is then expected
        # to fail). Trailing newline characters are stripped.
        kw = dict(self.DEFAULT_ENV, **kw)
        if failure:
            out = assert_python_failure(*args, **kw)
            out = out[2]
        else:
            out = assert_python_ok(*args, **kw)
            out = out[1]
        return out.decode().rstrip("\n\r")

    @unittest.skipIf(MS_WINDOWS, 'Windows has no POSIX locale')
    def test_posix_locale(self):
        code = 'import sys; print(sys.flags.utf8_mode)'

        out = self.get_output('-c', code, LC_ALL='C')
        self.assertEqual(out, '1')

    def test_xoption(self):
        code = 'import sys; print(sys.flags.utf8_mode)'

        out = self.get_output('-X', 'utf8', '-c', code)
        self.assertEqual(out, '1')

        # undocumented but accepted syntax: -X utf8=1
        out = self.get_output('-X', 'utf8=1', '-c', code)
        self.assertEqual(out, '1')

        out = self.get_output('-X', 'utf8=0', '-c', code)
        self.assertEqual(out, '0')

        if MS_WINDOWS:
            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
            # and has the priority over -X utf8
            out = self.get_output('-X', 'utf8', '-c', code,
                                  PYTHONLEGACYWINDOWSFSENCODING='1')
            self.assertEqual(out, '0')

    def test_env_var(self):
        code = 'import sys; print(sys.flags.utf8_mode)'

        out = self.get_output('-c', code, PYTHONUTF8='1')
        self.assertEqual(out, '1')

        out = self.get_output('-c', code, PYTHONUTF8='0')
        self.assertEqual(out, '0')

        # -X utf8 has the priority over PYTHONUTF8
        out = self.get_output('-X', 'utf8=0', '-c', code, PYTHONUTF8='1')
        self.assertEqual(out, '0')

        if MS_WINDOWS:
            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
            # and has the priority over PYTHONUTF8
            out = self.get_output('-X', 'utf8', '-c', code, PYTHONUTF8='1',
                                  PYTHONLEGACYWINDOWSFSENCODING='1')
            self.assertEqual(out, '0')

        # Cannot test with the POSIX locale, since the POSIX locale enables
        # the UTF-8 mode
        if not self.posix_locale():
            # PYTHONUTF8 should be ignored if -E is used
            out = self.get_output('-E', '-c', code, PYTHONUTF8='1')
            self.assertEqual(out, '0')

        # invalid mode
        out = self.get_output('-c', code, PYTHONUTF8='xxx', failure=True)
        self.assertIn('invalid PYTHONUTF8 environment variable value',
                      out.rstrip())

    def test_filesystemencoding(self):
        code = textwrap.dedent('''
            import sys
            print("{}/{}".format(sys.getfilesystemencoding(),
                                 sys.getfilesystemencodeerrors()))
        ''')

        if MS_WINDOWS:
            expected = 'utf-8/surrogatepass'
        else:
            expected = 'utf-8/surrogateescape'

        out = self.get_output('-X', 'utf8', '-c', code)
        self.assertEqual(out, expected)

        if MS_WINDOWS:
            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
            # and has the priority over -X utf8 and PYTHONUTF8
            out = self.get_output('-X', 'utf8', '-c', code,
                                  PYTHONUTF8='strict',
                                  PYTHONLEGACYWINDOWSFSENCODING='1')
            self.assertEqual(out, 'mbcs/replace')

    def test_stdio(self):
        code = textwrap.dedent('''
            import sys
            print(f"stdin: {sys.stdin.encoding}/{sys.stdin.errors}")
            print(f"stdout: {sys.stdout.encoding}/{sys.stdout.errors}")
            print(f"stderr: {sys.stderr.encoding}/{sys.stderr.errors}")
        ''')

        out = self.get_output('-X', 'utf8', '-c', code,
                              PYTHONIOENCODING='')
        self.assertEqual(out.splitlines(),
                         ['stdin: utf-8/surrogateescape',
                          'stdout: utf-8/surrogateescape',
                          'stderr: utf-8/backslashreplace'])

        # PYTHONIOENCODING has the priority over PYTHONUTF8
        out = self.get_output('-X', 'utf8', '-c', code,
                              PYTHONIOENCODING="latin1")
        self.assertEqual(out.splitlines(),
                         ['stdin: latin1/strict',
                          'stdout: latin1/strict',
                          'stderr: latin1/backslashreplace'])

        out = self.get_output('-X', 'utf8', '-c', code,
                              PYTHONIOENCODING=":namereplace")
        self.assertEqual(out.splitlines(),
                         ['stdin: UTF-8/namereplace',
                          'stdout: UTF-8/namereplace',
                          'stderr: UTF-8/backslashreplace'])

    def test_io(self):
        code = textwrap.dedent('''
            import sys
            filename = sys.argv[1]
            with open(filename) as fp:
                print(f"{fp.encoding}/{fp.errors}")
        ''')
        filename = __file__

        out = self.get_output('-c', code, filename, PYTHONUTF8='1')
        self.assertEqual(out, 'UTF-8/strict')

    def _check_io_encoding(self, module, encoding=None, errors=None):
        # Open this file with open() imported from *module* in a child
        # process running with PYTHONUTF8=1, and check the encoding/errors
        # reported by the file object. Arguments left unset are expected to
        # come out as 'UTF-8' and 'strict' respectively.
        filename = __file__

        # Encoding explicitly set
        args = []
        if encoding:
            args.append(f'encoding={encoding!r}')
        if errors:
            args.append(f'errors={errors!r}')
        code = textwrap.dedent('''
            import sys
            from %s import open
            filename = sys.argv[1]
            with open(filename, %s) as fp:
                print(f"{fp.encoding}/{fp.errors}")
        ''') % (module, ', '.join(args))
        out = self.get_output('-c', code, filename,
                              PYTHONUTF8='1')

        if not encoding:
            encoding = 'UTF-8'
        if not errors:
            errors = 'strict'
        self.assertEqual(out, f'{encoding}/{errors}')

    def check_io_encoding(self, module):
        self._check_io_encoding(module, encoding="latin1")
        self._check_io_encoding(module, errors="namereplace")
        self._check_io_encoding(module,
                                encoding="latin1", errors="namereplace")

    def test_io_encoding(self):
        self.check_io_encoding('io')

    # This method must not share its name with test_io_encoding(): a second
    # definition of the same name in the class body would replace the first
    # one, and the 'io' variant would then never run.
    def test_pyio_encoding(self):
        self.check_io_encoding('_pyio')

    def test_locale_getpreferredencoding(self):
        code = 'import locale; print(locale.getpreferredencoding(False), locale.getpreferredencoding(True))'
        out = self.get_output('-X', 'utf8', '-c', code)
        self.assertEqual(out, 'UTF-8 UTF-8')

        out = self.get_output('-X', 'utf8', '-c', code, LC_ALL='C')
        self.assertEqual(out, 'UTF-8 UTF-8')
if __name__ == "__main__":
unittest.main()

View file

@ -0,0 +1 @@
Add a new UTF-8 mode: implementation of the :pep:`540`.

View file

@ -1490,6 +1490,20 @@ _winapi_WriteFile_impl(PyObject *module, HANDLE handle, PyObject *buffer,
} }
/*[clinic input]
_winapi.GetACP
Get the current Windows ANSI code page identifier.
[clinic start generated code]*/
static PyObject *
_winapi_GetACP_impl(PyObject *module)
/*[clinic end generated code: output=f7ee24bf705dbb88 input=1433c96d03a05229]*/
{
    /* Fetch the ANSI code page identifier from GetACP() and hand it back
       as a Python integer. */
    unsigned long code_page = GetACP();

    return PyLong_FromUnsignedLong(code_page);
}
static PyMethodDef winapi_functions[] = { static PyMethodDef winapi_functions[] = {
_WINAPI_CLOSEHANDLE_METHODDEF _WINAPI_CLOSEHANDLE_METHODDEF
_WINAPI_CONNECTNAMEDPIPE_METHODDEF _WINAPI_CONNECTNAMEDPIPE_METHODDEF
@ -1515,6 +1529,7 @@ static PyMethodDef winapi_functions[] = {
_WINAPI_WAITFORMULTIPLEOBJECTS_METHODDEF _WINAPI_WAITFORMULTIPLEOBJECTS_METHODDEF
_WINAPI_WAITFORSINGLEOBJECT_METHODDEF _WINAPI_WAITFORSINGLEOBJECT_METHODDEF
_WINAPI_WRITEFILE_METHODDEF _WINAPI_WRITEFILE_METHODDEF
_WINAPI_GETACP_METHODDEF
{NULL, NULL} {NULL, NULL}
}; };

View file

@ -889,4 +889,22 @@ _winapi_WriteFile(PyObject *module, PyObject **args, Py_ssize_t nargs, PyObject
exit: exit:
return return_value; return return_value;
} }
/*[clinic end generated code: output=fba2ad7bf1a87e4a input=a9049054013a1b77]*/
PyDoc_STRVAR(_winapi_GetACP__doc__,
"GetACP($module, /)\n"
"--\n"
"\n"
"Get the current Windows ANSI code page identifier.");
#define _WINAPI_GETACP_METHODDEF \
{"GetACP", (PyCFunction)_winapi_GetACP, METH_NOARGS, _winapi_GetACP__doc__},
static PyObject *
_winapi_GetACP_impl(PyObject *module);
static PyObject *
_winapi_GetACP(PyObject *module, PyObject *Py_UNUSED(ignored))
{
return _winapi_GetACP_impl(module);
}
/*[clinic end generated code: output=fd91c1ec286f0bf3 input=a9049054013a1b77]*/

View file

@ -1114,50 +1114,32 @@ pymain_set_argv(_PyMain *pymain)
} }
static void
pymain_get_flag(int flag, int *value)
{
if (flag) {
*value = flag;
}
}
static void
pymain_set_flag(int *flag, int value)
{
/* Helper to set flag variables from command line options
* - uses the higher of the two values if they're both set
* - otherwise leaves the flag unset
*/
if (*flag < value) {
*flag = value;
}
}
/* Get Py_xxx global configuration variables */ /* Get Py_xxx global configuration variables */
static void static void
pymain_get_global_config(_PyMain *pymain) pymain_get_global_config(_PyMain *pymain)
{ {
_Py_CommandLineDetails *cmdline = &pymain->cmdline; _Py_CommandLineDetails *cmdline = &pymain->cmdline;
pymain_get_flag(Py_BytesWarningFlag, &cmdline->bytes_warning);
pymain_get_flag(Py_DebugFlag, &cmdline->debug);
pymain_get_flag(Py_InspectFlag, &cmdline->inspect);
pymain_get_flag(Py_InteractiveFlag, &cmdline->interactive);
pymain_get_flag(Py_IsolatedFlag, &cmdline->isolated);
pymain_get_flag(Py_OptimizeFlag, &cmdline->optimization_level);
pymain_get_flag(Py_DontWriteBytecodeFlag, &cmdline->dont_write_bytecode);
pymain_get_flag(Py_NoUserSiteDirectory, &cmdline->no_user_site_directory);
pymain_get_flag(Py_NoSiteFlag, &cmdline->no_site_import);
pymain_get_flag(Py_UnbufferedStdioFlag, &cmdline->use_unbuffered_io);
pymain_get_flag(Py_VerboseFlag, &cmdline->verbosity);
pymain_get_flag(Py_QuietFlag, &cmdline->quiet_flag);
#ifdef MS_WINDOWS
pymain_get_flag(Py_LegacyWindowsFSEncodingFlag, &cmdline->legacy_windows_fs_encoding);
pymain_get_flag(Py_LegacyWindowsStdioFlag, &cmdline->legacy_windows_stdio);
#endif
pymain_get_flag(Py_IgnoreEnvironmentFlag, &pymain->core_config.ignore_environment); cmdline->bytes_warning = Py_BytesWarningFlag;
cmdline->debug = Py_DebugFlag;
cmdline->inspect = Py_InspectFlag;
cmdline->interactive = Py_InteractiveFlag;
cmdline->isolated = Py_IsolatedFlag;
cmdline->optimization_level = Py_OptimizeFlag;
cmdline->dont_write_bytecode = Py_DontWriteBytecodeFlag;
cmdline->no_user_site_directory = Py_NoUserSiteDirectory;
cmdline->no_site_import = Py_NoSiteFlag;
cmdline->use_unbuffered_io = Py_UnbufferedStdioFlag;
cmdline->verbosity = Py_VerboseFlag;
cmdline->quiet_flag = Py_QuietFlag;
#ifdef MS_WINDOWS
cmdline->legacy_windows_fs_encoding = Py_LegacyWindowsFSEncodingFlag;
cmdline->legacy_windows_stdio = Py_LegacyWindowsStdioFlag;
#endif
cmdline->check_hash_pycs_mode = _Py_CheckHashBasedPycsMode ;
pymain->core_config.ignore_environment = Py_IgnoreEnvironmentFlag;
pymain->core_config.utf8_mode = Py_UTF8Mode;
} }
@ -1166,26 +1148,27 @@ static void
pymain_set_global_config(_PyMain *pymain) pymain_set_global_config(_PyMain *pymain)
{ {
_Py_CommandLineDetails *cmdline = &pymain->cmdline; _Py_CommandLineDetails *cmdline = &pymain->cmdline;
pymain_set_flag(&Py_BytesWarningFlag, cmdline->bytes_warning);
pymain_set_flag(&Py_DebugFlag, cmdline->debug); Py_BytesWarningFlag = cmdline->bytes_warning;
pymain_set_flag(&Py_InspectFlag, cmdline->inspect); Py_DebugFlag = cmdline->debug;
pymain_set_flag(&Py_InteractiveFlag, cmdline->interactive); Py_InspectFlag = cmdline->inspect;
pymain_set_flag(&Py_IsolatedFlag, cmdline->isolated); Py_InteractiveFlag = cmdline->interactive;
pymain_set_flag(&Py_OptimizeFlag, cmdline->optimization_level); Py_IsolatedFlag = cmdline->isolated;
pymain_set_flag(&Py_DontWriteBytecodeFlag, cmdline->dont_write_bytecode); Py_OptimizeFlag = cmdline->optimization_level;
pymain_set_flag(&Py_NoUserSiteDirectory, cmdline->no_user_site_directory); Py_DontWriteBytecodeFlag = cmdline->dont_write_bytecode;
pymain_set_flag(&Py_NoSiteFlag, cmdline->no_site_import); Py_NoUserSiteDirectory = cmdline->no_user_site_directory;
pymain_set_flag(&Py_UnbufferedStdioFlag, cmdline->use_unbuffered_io); Py_NoSiteFlag = cmdline->no_site_import;
pymain_set_flag(&Py_VerboseFlag, cmdline->verbosity); Py_UnbufferedStdioFlag = cmdline->use_unbuffered_io;
pymain_set_flag(&Py_QuietFlag, cmdline->quiet_flag); Py_VerboseFlag = cmdline->verbosity;
if (cmdline->check_hash_pycs_mode) Py_QuietFlag = cmdline->quiet_flag;
_Py_CheckHashBasedPycsMode = cmdline->check_hash_pycs_mode; _Py_CheckHashBasedPycsMode = cmdline->check_hash_pycs_mode;
#ifdef MS_WINDOWS #ifdef MS_WINDOWS
pymain_set_flag(&Py_LegacyWindowsFSEncodingFlag, cmdline->legacy_windows_fs_encoding); Py_LegacyWindowsFSEncodingFlag = cmdline->legacy_windows_fs_encoding;
pymain_set_flag(&Py_LegacyWindowsStdioFlag, cmdline->legacy_windows_stdio); Py_LegacyWindowsStdioFlag = cmdline->legacy_windows_stdio;
#endif #endif
pymain_set_flag(&Py_IgnoreEnvironmentFlag, pymain->core_config.ignore_environment); Py_IgnoreEnvironmentFlag = pymain->core_config.ignore_environment;
Py_UTF8Mode = pymain->core_config.utf8_mode;
} }
@ -1609,6 +1592,57 @@ _PyMainInterpreterConfig_ReadEnv(_PyMainInterpreterConfig *config)
} }
/* Decide the value of core_config.utf8_mode.

   Sources are consulted in this order, and the first one that applies wins:

   1. On Windows, a set legacy_windows_fs_encoding flag forces utf8_mode to 0.
   2. The "-X utf8" option: "-X utf8" and "-X utf8=1" enable the mode,
      "-X utf8=0" disables it, and any other value is an error.
   3. The PYTHONUTF8 environment variable, as returned by
      pymain_get_env_var(): "1" enables the mode, "0" disables it, and any
      other value is an error.

   If none of them applies, utf8_mode is left unchanged.

   Return 0 on success. On an invalid value, store the error in pymain->err
   and return -1. */
static int
pymain_init_utf8_mode(_PyMain *pymain)
{
    _PyCoreConfig *core_config = &pymain->core_config;

#ifdef MS_WINDOWS
    if (pymain->cmdline.legacy_windows_fs_encoding) {
        core_config->utf8_mode = 0;
        return 0;
    }
#endif

    wchar_t *xopt = pymain_get_xoption(pymain, L"utf8");
    if (xopt != NULL) {
        wchar_t *equals = wcschr(xopt, L'=');
        if (equals == NULL) {
            /* Bare "-X utf8": no explicit value means "enable" */
            core_config->utf8_mode = 1;
            return 0;
        }

        wchar_t *value = equals + 1;
        if (wcscmp(value, L"1") == 0) {
            core_config->utf8_mode = 1;
            return 0;
        }
        if (wcscmp(value, L"0") == 0) {
            core_config->utf8_mode = 0;
            return 0;
        }

        pymain->err = _Py_INIT_USER_ERR("invalid -X utf8 option value");
        return -1;
    }

    /* Only reached when no "-X utf8" option was given */
    char *env = pymain_get_env_var("PYTHONUTF8");
    if (env == NULL) {
        return 0;
    }

    if (strcmp(env, "1") == 0) {
        core_config->utf8_mode = 1;
        return 0;
    }
    if (strcmp(env, "0") == 0) {
        core_config->utf8_mode = 0;
        return 0;
    }

    pymain->err = _Py_INIT_USER_ERR("invalid PYTHONUTF8 environment "
                                    "variable value");
    return -1;
}
static int static int
@ -1674,6 +1708,9 @@ pymain_parse_envvars(_PyMain *pymain)
pymain->core_config.malloc_stats = 1; pymain->core_config.malloc_stats = 1;
} }
if (pymain_init_utf8_mode(pymain) < 0) {
return -1;
}
return 0; return 0;
} }
@ -1702,6 +1739,7 @@ pymain_parse_cmdline_envvars_impl(_PyMain *pymain)
if (pymain_parse_envvars(pymain) < 0) { if (pymain_parse_envvars(pymain) < 0) {
return -1; return -1;
} }
/* FIXME: if utf8_mode value changed, parse again cmdline */
_PyInitError err = _PyMainInterpreterConfig_Read(&pymain->config); _PyInitError err = _PyMainInterpreterConfig_Read(&pymain->config);
if (_Py_INIT_FAILED(err)) { if (_Py_INIT_FAILED(err)) {
@ -1730,6 +1768,7 @@ pymain_parse_cmdline_envvars(_PyMain *pymain)
static int static int
pymain_init_python(_PyMain *pymain) pymain_init_python(_PyMain *pymain)
{ {
pymain_set_global_config(pymain); pymain_set_global_config(pymain);
pymain_init_stdio(pymain); pymain_init_stdio(pymain);
@ -1788,6 +1827,7 @@ pymain_init(_PyMain *pymain)
return -1; return -1;
} }
pymain->core_config.utf8_mode = Py_UTF8Mode;
pymain->core_config._disable_importlib = 0; pymain->core_config._disable_importlib = 0;
pymain->config.install_signal_handlers = 1; pymain->config.install_signal_handlers = 1;

View file

@ -5079,16 +5079,17 @@ onError:
return NULL; return NULL;
} }
#if defined(__APPLE__) || defined(__ANDROID__)
/* Simplified UTF-8 decoder using surrogateescape error handler, /* UTF-8 decoder using the surrogateescape error handler.
used to decode the command line arguments on Mac OS X and Android.
Return a pointer to a newly allocated wide character string (use On success, return a pointer to a newly allocated wide character string (use
PyMem_RawFree() to free the memory), or NULL on memory allocation error. */ PyMem_RawFree() to free the memory) and write the output length (in number
of wchar_t units) into *p_wlen (if p_wlen is set).
On memory allocation failure, return NULL and write (size_t)-1 into *p_wlen
(if p_wlen is set). */
wchar_t* wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
{ {
const char *e; const char *e;
wchar_t *unicode; wchar_t *unicode;
@ -5096,11 +5097,20 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
/* Note: size will always be longer than the resulting Unicode /* Note: size will always be longer than the resulting Unicode
character count */ character count */
if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
if (p_wlen) {
*p_wlen = (size_t)-1;
}
return NULL; return NULL;
}
unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t)); unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
if (!unicode) if (!unicode) {
if (p_wlen) {
*p_wlen = (size_t)-1;
}
return NULL; return NULL;
}
/* Unpack UTF-8 encoded data */ /* Unpack UTF-8 encoded data */
e = s + size; e = s + size;
@ -5130,10 +5140,12 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
} }
} }
unicode[outpos] = L'\0'; unicode[outpos] = L'\0';
if (p_wlen) {
*p_wlen = outpos;
}
return unicode; return unicode;
} }
#endif /* __APPLE__ or __ANDROID__ */
/* Primary internal function which creates utf8 encoded bytes objects. /* Primary internal function which creates utf8 encoded bytes objects.

View file

@ -17,6 +17,15 @@ wmain(int argc, wchar_t **argv)
#else #else
/* Report an unrecoverable startup error and terminate the process.

   Writes "Fatal Python error: <msg>" followed by a newline to stderr,
   flushes stderr so the message is not lost, then calls exit(1).
   This function never returns. */
static void _Py_NO_RETURN
fatal_error(const char *msg)
{
    FILE *out = stderr;

    fprintf(out, "Fatal Python error: %s\n", msg);
    fflush(out);
    exit(1);
}
int int
main(int argc, char **argv) main(int argc, char **argv)
{ {
@ -28,9 +37,7 @@ main(int argc, char **argv)
_PyInitError err = _PyRuntime_Initialize(); _PyInitError err = _PyRuntime_Initialize();
if (_Py_INIT_FAILED(err)) { if (_Py_INIT_FAILED(err)) {
fprintf(stderr, "Fatal Python error: %s\n", err.msg); fatal_error(err.msg);
fflush(stderr);
exit(1);
} }
/* Force default allocator, to be able to release memory above /* Force default allocator, to be able to release memory above
@ -40,7 +47,7 @@ main(int argc, char **argv)
argv_copy = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1)); argv_copy = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1));
argv_copy2 = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1)); argv_copy2 = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1));
if (!argv_copy || !argv_copy2) { if (!argv_copy || !argv_copy2) {
fprintf(stderr, "out of memory\n"); fatal_error("out of memory");
return 1; return 1;
} }
@ -55,7 +62,7 @@ main(int argc, char **argv)
oldloc = _PyMem_RawStrdup(setlocale(LC_ALL, NULL)); oldloc = _PyMem_RawStrdup(setlocale(LC_ALL, NULL));
if (!oldloc) { if (!oldloc) {
fprintf(stderr, "out of memory\n"); fatal_error("out of memory");
return 1; return 1;
} }
@ -73,6 +80,7 @@ main(int argc, char **argv)
* details. * details.
*/ */
if (_Py_LegacyLocaleDetected()) { if (_Py_LegacyLocaleDetected()) {
Py_UTF8Mode = 1;
_Py_CoerceLegacyLocale(); _Py_CoerceLegacyLocale();
} }
@ -81,10 +89,7 @@ main(int argc, char **argv)
argv_copy[i] = Py_DecodeLocale(argv[i], NULL); argv_copy[i] = Py_DecodeLocale(argv[i], NULL);
if (!argv_copy[i]) { if (!argv_copy[i]) {
PyMem_RawFree(oldloc); PyMem_RawFree(oldloc);
fprintf(stderr, "Fatal Python error: " fatal_error("unable to decode the command line arguments");
"unable to decode the command line argument #%i\n",
i + 1);
return 1;
} }
argv_copy2[i] = argv_copy[i]; argv_copy2[i] = argv_copy[i];
} }

View file

@ -29,6 +29,9 @@ const char *Py_FileSystemDefaultEncoding = NULL; /* set by initfsencoding() */
int Py_HasFileSystemDefaultEncoding = 0; int Py_HasFileSystemDefaultEncoding = 0;
#endif #endif
const char *Py_FileSystemDefaultEncodeErrors = "surrogateescape"; const char *Py_FileSystemDefaultEncodeErrors = "surrogateescape";
/* UTF-8 mode (PEP 540): if non-zero, use the UTF-8 encoding, and change stdin
and stdout error handler to "surrogateescape". */
int Py_UTF8Mode = 0;
_Py_IDENTIFIER(__builtins__); _Py_IDENTIFIER(__builtins__);
_Py_IDENTIFIER(__dict__); _Py_IDENTIFIER(__dict__);

View file

@ -20,9 +20,8 @@ extern int winerror_to_errno(int);
#include <fcntl.h> #include <fcntl.h>
#endif /* HAVE_FCNTL_H */ #endif /* HAVE_FCNTL_H */
#if defined(__APPLE__) || defined(__ANDROID__) extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size,
extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size); size_t *p_wlen);
#endif
#ifdef O_CLOEXEC #ifdef O_CLOEXEC
/* Does open() support the O_CLOEXEC flag? Possible values: /* Does open() support the O_CLOEXEC flag? Possible values:
@ -250,40 +249,9 @@ decode_ascii_surrogateescape(const char *arg, size_t *size)
} }
#endif #endif
static wchar_t*
/* Decode a byte string from the locale encoding with the decode_locale(const char* arg, size_t *size)
surrogateescape error handler: undecodable bytes are decoded as characters
in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
character, escape the bytes using the surrogateescape error handler instead
of decoding them.
Return a pointer to a newly allocated wide character string, use
PyMem_RawFree() to free the memory. If size is not NULL, write the number of
wide characters excluding the null character into *size
Return NULL on decoding error or memory allocation error. If *size* is not
NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
decoding error.
Decoding errors should never happen, unless there is a bug in the C
library.
Use the Py_EncodeLocale() function to encode the character string back to a
byte string. */
wchar_t*
Py_DecodeLocale(const char* arg, size_t *size)
{ {
#if defined(__APPLE__) || defined(__ANDROID__)
wchar_t *wstr;
wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
if (size != NULL) {
if (wstr != NULL)
*size = wcslen(wstr);
else
*size = (size_t)-1;
}
return wstr;
#else
wchar_t *res; wchar_t *res;
size_t argsize; size_t argsize;
size_t count; size_t count;
@ -293,19 +261,6 @@ Py_DecodeLocale(const char* arg, size_t *size)
mbstate_t mbs; mbstate_t mbs;
#endif #endif
#ifndef MS_WINDOWS
if (force_ascii == -1)
force_ascii = check_force_ascii();
if (force_ascii) {
/* force ASCII encoding to workaround mbstowcs() issue */
res = decode_ascii_surrogateescape(arg, size);
if (res == NULL)
goto oom;
return res;
}
#endif
#ifdef HAVE_BROKEN_MBSTOWCS #ifdef HAVE_BROKEN_MBSTOWCS
/* Some platforms have a broken implementation of /* Some platforms have a broken implementation of
* mbstowcs which does not count the characters that * mbstowcs which does not count the characters that
@ -402,43 +357,84 @@ Py_DecodeLocale(const char* arg, size_t *size)
goto oom; goto oom;
#endif /* HAVE_MBRTOWC */ #endif /* HAVE_MBRTOWC */
return res; return res;
oom: oom:
if (size != NULL) if (size != NULL) {
*size = (size_t)-1; *size = (size_t)-1;
}
return NULL; return NULL;
}
/* Decode a byte string from the locale encoding with the
surrogateescape error handler: undecodable bytes are decoded as characters
in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
character, escape the bytes using the surrogateescape error handler instead
of decoding them.
Return a pointer to a newly allocated wide character string, use
PyMem_RawFree() to free the memory. If size is not NULL, write the number of
wide characters excluding the null character into *size
Return NULL on decoding error or memory allocation error. If *size* is not
NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
decoding error.
Decoding errors should never happen, unless there is a bug in the C
library.
Use the Py_EncodeLocale() function to encode the character string back to a
byte string. */
wchar_t*
Py_DecodeLocale(const char* arg, size_t *size)
{
#if defined(__APPLE__) || defined(__ANDROID__)
return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
#else
if (Py_UTF8Mode) {
return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
}
#ifndef MS_WINDOWS
if (force_ascii == -1)
force_ascii = check_force_ascii();
if (force_ascii) {
/* force ASCII encoding to workaround mbstowcs() issue */
wchar_t *wstr = decode_ascii_surrogateescape(arg, size);
if (wstr == NULL) {
if (size != NULL) {
*size = (size_t)-1;
}
return NULL;
}
return wstr;
}
#endif
return decode_locale(arg, size);
#endif /* __APPLE__ or __ANDROID__ */ #endif /* __APPLE__ or __ANDROID__ */
} }
/* Encode a wide character string to the locale encoding with the static char*
surrogateescape error handler: surrogate characters in the range _Py_EncodeLocaleUTF8(const wchar_t *text, size_t *error_pos)
U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
Return a pointer to a newly allocated byte string, use PyMem_Free() to free
the memory. Return NULL on encoding or memory allocation error.
If error_pos is not NULL, *error_pos is set to the index of the invalid
character on encoding error, or set to (size_t)-1 otherwise.
Use the Py_DecodeLocale() function to decode the bytes string back to a wide
character string. */
char*
Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
{ {
#if defined(__APPLE__) || defined(__ANDROID__)
Py_ssize_t len; Py_ssize_t len;
PyObject *unicode, *bytes = NULL; PyObject *unicode, *bytes = NULL;
char *cpath; char *cpath;
unicode = PyUnicode_FromWideChar(text, wcslen(text)); unicode = PyUnicode_FromWideChar(text, wcslen(text));
if (unicode == NULL) if (unicode == NULL) {
return NULL; return NULL;
}
bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape"); bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Py_DECREF(unicode); Py_DECREF(unicode);
if (bytes == NULL) { if (bytes == NULL) {
PyErr_Clear(); PyErr_Clear();
if (error_pos != NULL) if (error_pos != NULL) {
*error_pos = (size_t)-1; *error_pos = (size_t)-1;
}
return NULL; return NULL;
} }
@ -447,27 +443,24 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
if (cpath == NULL) { if (cpath == NULL) {
PyErr_Clear(); PyErr_Clear();
Py_DECREF(bytes); Py_DECREF(bytes);
if (error_pos != NULL) if (error_pos != NULL) {
*error_pos = (size_t)-1; *error_pos = (size_t)-1;
}
return NULL; return NULL;
} }
memcpy(cpath, PyBytes_AsString(bytes), len + 1); memcpy(cpath, PyBytes_AsString(bytes), len + 1);
Py_DECREF(bytes); Py_DECREF(bytes);
return cpath; return cpath;
#else /* __APPLE__ */ }
static char*
encode_locale(const wchar_t *text, size_t *error_pos)
{
const size_t len = wcslen(text); const size_t len = wcslen(text);
char *result = NULL, *bytes = NULL; char *result = NULL, *bytes = NULL;
size_t i, size, converted; size_t i, size, converted;
wchar_t c, buf[2]; wchar_t c, buf[2];
#ifndef MS_WINDOWS
if (force_ascii == -1)
force_ascii = check_force_ascii();
if (force_ascii)
return encode_ascii_surrogateescape(text, error_pos);
#endif
/* The function works in two steps: /* The function works in two steps:
1. compute the length of the output buffer in bytes (size) 1. compute the length of the output buffer in bytes (size)
2. outputs the bytes */ 2. outputs the bytes */
@ -522,6 +515,39 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
bytes = result; bytes = result;
} }
return result; return result;
}
/* Encode a wide character string to the locale encoding with the
surrogateescape error handler: surrogate characters in the range
U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
Return a pointer to a newly allocated byte string, use PyMem_Free() to free
the memory. Return NULL on encoding or memory allocation error.
If error_pos is not NULL, *error_pos is set to (size_t)-1 on success, or set
to the index of the invalid character on encoding error.
Use the Py_DecodeLocale() function to decode the bytes string back to a wide
character string. */
char*
Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
{
#if defined(__APPLE__) || defined(__ANDROID__)
return _Py_EncodeLocaleUTF8(text, error_pos);
#else /* __APPLE__ */
if (Py_UTF8Mode) {
return _Py_EncodeLocaleUTF8(text, error_pos);
}
#ifndef MS_WINDOWS
if (force_ascii == -1)
force_ascii = check_force_ascii();
if (force_ascii)
return encode_ascii_surrogateescape(text, error_pos);
#endif
return encode_locale(text, error_pos);
#endif /* __APPLE__ or __ANDROID__ */ #endif /* __APPLE__ or __ANDROID__ */
} }

View file

@ -54,7 +54,7 @@ extern grammar _PyParser_Grammar; /* From graminit.c */
static _PyInitError add_main_module(PyInterpreterState *interp); static _PyInitError add_main_module(PyInterpreterState *interp);
static _PyInitError initfsencoding(PyInterpreterState *interp); static _PyInitError initfsencoding(PyInterpreterState *interp);
static _PyInitError initsite(void); static _PyInitError initsite(void);
static _PyInitError init_sys_streams(void); static _PyInitError init_sys_streams(PyInterpreterState *interp);
static _PyInitError initsigs(void); static _PyInitError initsigs(void);
static void call_py_exitfuncs(void); static void call_py_exitfuncs(void);
static void wait_for_thread_shutdown(void); static void wait_for_thread_shutdown(void);
@ -925,7 +925,7 @@ _Py_InitializeMainInterpreter(const _PyMainInterpreterConfig *config)
return err; return err;
} }
err = init_sys_streams(); err = init_sys_streams(interp);
if (_Py_INIT_FAILED(err)) { if (_Py_INIT_FAILED(err)) {
return err; return err;
} }
@ -1410,7 +1410,7 @@ new_interpreter(PyThreadState **tstate_p)
return err; return err;
} }
err = init_sys_streams(); err = init_sys_streams(interp);
if (_Py_INIT_FAILED(err)) { if (_Py_INIT_FAILED(err)) {
return err; return err;
} }
@ -1558,7 +1558,13 @@ initfsencoding(PyInterpreterState *interp)
Py_FileSystemDefaultEncodeErrors = "surrogatepass"; Py_FileSystemDefaultEncodeErrors = "surrogatepass";
} }
#else #else
if (Py_FileSystemDefaultEncoding == NULL) { if (Py_FileSystemDefaultEncoding == NULL &&
interp->core_config.utf8_mode)
{
Py_FileSystemDefaultEncoding = "utf-8";
Py_HasFileSystemDefaultEncoding = 1;
}
else if (Py_FileSystemDefaultEncoding == NULL) {
Py_FileSystemDefaultEncoding = get_locale_encoding(); Py_FileSystemDefaultEncoding = get_locale_encoding();
if (Py_FileSystemDefaultEncoding == NULL) { if (Py_FileSystemDefaultEncoding == NULL) {
return _Py_INIT_ERR("Unable to get the locale encoding"); return _Py_INIT_ERR("Unable to get the locale encoding");
@ -1749,7 +1755,7 @@ error:
/* Initialize sys.stdin, stdout, stderr and builtins.open */ /* Initialize sys.stdin, stdout, stderr and builtins.open */
static _PyInitError static _PyInitError
init_sys_streams(void) init_sys_streams(PyInterpreterState *interp)
{ {
PyObject *iomod = NULL, *wrapper; PyObject *iomod = NULL, *wrapper;
PyObject *bimod = NULL; PyObject *bimod = NULL;
@ -1794,10 +1800,10 @@ init_sys_streams(void)
encoding = _Py_StandardStreamEncoding; encoding = _Py_StandardStreamEncoding;
errors = _Py_StandardStreamErrors; errors = _Py_StandardStreamErrors;
if (!encoding || !errors) { if (!encoding || !errors) {
pythonioencoding = Py_GETENV("PYTHONIOENCODING"); char *opt = Py_GETENV("PYTHONIOENCODING");
if (pythonioencoding) { if (opt && opt[0] != '\0') {
char *err; char *err;
pythonioencoding = _PyMem_Strdup(pythonioencoding); pythonioencoding = _PyMem_Strdup(opt);
if (pythonioencoding == NULL) { if (pythonioencoding == NULL) {
PyErr_NoMemory(); PyErr_NoMemory();
goto error; goto error;
@ -1814,7 +1820,12 @@ init_sys_streams(void)
encoding = pythonioencoding; encoding = pythonioencoding;
} }
} }
if (!errors && !(pythonioencoding && *pythonioencoding)) { else if (interp->core_config.utf8_mode) {
encoding = "utf-8";
errors = "surrogateescape";
}
if (!errors && !pythonioencoding) {
/* Choose the default error handler based on the current locale */ /* Choose the default error handler based on the current locale */
errors = get_default_standard_stream_error_handler(); errors = get_default_standard_stream_error_handler();
} }

View file

@ -1814,6 +1814,7 @@ static PyStructSequence_Field flags_fields[] = {
{"hash_randomization", "-R"}, {"hash_randomization", "-R"},
{"isolated", "-I"}, {"isolated", "-I"},
{"dev_mode", "-X dev"}, {"dev_mode", "-X dev"},
{"utf8_mode", "-X utf8"},
{0} {0}
}; };
@ -1821,7 +1822,7 @@ static PyStructSequence_Desc flags_desc = {
"sys.flags", /* name */ "sys.flags", /* name */
flags__doc__, /* doc */ flags__doc__, /* doc */
flags_fields, /* fields */ flags_fields, /* fields */
14 15
}; };
static PyObject* static PyObject*
@ -1853,8 +1854,9 @@ make_flags(void)
SetFlag(Py_QuietFlag); SetFlag(Py_QuietFlag);
SetFlag(Py_HashRandomizationFlag); SetFlag(Py_HashRandomizationFlag);
SetFlag(Py_IsolatedFlag); SetFlag(Py_IsolatedFlag);
#undef SetFlag
PyStructSequence_SET_ITEM(seq, pos++, PyBool_FromLong(core_config->dev_mode)); PyStructSequence_SET_ITEM(seq, pos++, PyBool_FromLong(core_config->dev_mode));
SetFlag(Py_UTF8Mode);
#undef SetFlag
if (PyErr_Occurred()) { if (PyErr_Occurred()) {
Py_DECREF(seq); Py_DECREF(seq);