mirror of
https://github.com/python/cpython.git
synced 2025-07-24 11:44:31 +00:00
GH-133711: Enable UTF-8 mode by default (PEP 686) (#133712)
Co-authored-by: Victor Stinner <vstinner@python.org>
This commit is contained in:
parent
f320c951c3
commit
c89a66feb1
14 changed files with 93 additions and 85 deletions
|
@ -975,9 +975,7 @@ PyPreConfig
|
|||
Set to ``0`` or ``1`` by the :option:`-X utf8 <-X>` command line option
|
||||
and the :envvar:`PYTHONUTF8` environment variable.
|
||||
|
||||
Also set to ``1`` if the ``LC_CTYPE`` locale is ``C`` or ``POSIX``.
|
||||
|
||||
Default: ``-1`` in Python config and ``0`` in isolated config.
|
||||
Default: ``1``.
|
||||
|
||||
|
||||
.. _c-preinit:
|
||||
|
|
|
@ -108,6 +108,12 @@ Python UTF-8 Mode
|
|||
.. versionadded:: 3.7
|
||||
See :pep:`540` for more details.
|
||||
|
||||
.. versionchanged:: next
|
||||
|
||||
Python UTF-8 mode is now enabled by default (:pep:`686`).
|
||||
It may be disabled by setting :envvar:`PYTHONUTF8=0 <PYTHONUTF8>` as
|
||||
an environment variable or by using the :option:`-X utf8=0 <-X>` command line option.
|
||||
|
||||
The Python UTF-8 Mode ignores the :term:`locale encoding` and forces the usage
|
||||
of the UTF-8 encoding:
|
||||
|
||||
|
@ -139,31 +145,22 @@ level APIs also exhibit different default behaviours:
|
|||
default so that attempting to open a binary file in text mode is likely
|
||||
to raise an exception rather than producing nonsense data.
|
||||
|
||||
The :ref:`Python UTF-8 Mode <utf8-mode>` is enabled if the LC_CTYPE locale is
|
||||
``C`` or ``POSIX`` at Python startup (see the :c:func:`PyConfig_Read`
|
||||
function).
|
||||
|
||||
It can be enabled or disabled using the :option:`-X utf8 <-X>` command line
|
||||
option and the :envvar:`PYTHONUTF8` environment variable.
|
||||
|
||||
If the :envvar:`PYTHONUTF8` environment variable is not set at all, then the
|
||||
interpreter defaults to using the current locale settings, *unless* the current
|
||||
locale is identified as a legacy ASCII-based locale (as described for
|
||||
:envvar:`PYTHONCOERCECLOCALE`), and locale coercion is either disabled or
|
||||
fails. In such legacy locales, the interpreter will default to enabling UTF-8
|
||||
mode unless explicitly instructed not to do so.
|
||||
|
||||
The Python UTF-8 Mode can only be enabled at the Python startup. Its value
|
||||
The :ref:`Python UTF-8 Mode <utf8-mode>` is enabled by default.
|
||||
It can be disabled using the :option:`-X utf8=0 <-X>` command line
|
||||
option or the :envvar:`PYTHONUTF8=0 <PYTHONUTF8>` environment variable.
|
||||
The Python UTF-8 Mode can only be disabled at Python startup. Its value
|
||||
can be read from :data:`sys.flags.utf8_mode <sys.flags>`.
|
||||
|
||||
If the UTF-8 mode is disabled, the interpreter defaults to using
|
||||
the current locale settings, *unless* the current locale is identified
|
||||
as a legacy ASCII-based locale (as described for :envvar:`PYTHONCOERCECLOCALE`),
|
||||
and locale coercion is either disabled or fails.
|
||||
In such legacy locales, the interpreter will default to enabling UTF-8 mode
|
||||
unless explicitly instructed not to do so.
|
||||
|
||||
See also the :ref:`UTF-8 mode on Windows <win-utf8-mode>`
|
||||
and the :term:`filesystem encoding and error handler`.
|
||||
|
||||
.. seealso::
|
||||
|
||||
:pep:`686`
|
||||
Python 3.15 will make :ref:`utf8-mode` default.
|
||||
|
||||
|
||||
.. _os-procinfo:
|
||||
|
||||
|
|
|
@ -1006,6 +1006,9 @@ UTF-8 mode
|
|||
==========
|
||||
|
||||
.. versionadded:: 3.7
|
||||
.. versionchanged:: next
|
||||
|
||||
Python UTF-8 mode is now enabled by default (:pep:`686`).
|
||||
|
||||
Windows still uses legacy encodings for the system encoding (the ANSI Code
|
||||
Page). Python uses it for the default encoding of text files (e.g.
|
||||
|
@ -1014,20 +1017,22 @@ Page). Python uses it for the default encoding of text files (e.g.
|
|||
This may cause issues because UTF-8 is widely used on the internet
|
||||
and most Unix systems, including WSL (Windows Subsystem for Linux).
|
||||
|
||||
You can use the :ref:`Python UTF-8 Mode <utf8-mode>` to change the default text
|
||||
encoding to UTF-8. You can enable the :ref:`Python UTF-8 Mode <utf8-mode>` via
|
||||
the ``-X utf8`` command line option, or the ``PYTHONUTF8=1`` environment
|
||||
variable. See :envvar:`PYTHONUTF8` for enabling UTF-8 mode, and
|
||||
:ref:`setting-envvars` for how to modify environment variables.
|
||||
|
||||
When the :ref:`Python UTF-8 Mode <utf8-mode>` is enabled, you can still use the
|
||||
The :ref:`Python UTF-8 Mode <utf8-mode>`, enabled by default, can help by
|
||||
changing the default text encoding to UTF-8.
|
||||
When the :ref:`UTF-8 mode <utf8-mode>` is enabled, you can still use the
|
||||
system encoding (the ANSI Code Page) via the "mbcs" codec.
|
||||
|
||||
Note that adding ``PYTHONUTF8=1`` to the default environment variables
|
||||
will affect all Python 3.7+ applications on your system.
|
||||
If you have any Python 3.7+ applications which rely on the legacy
|
||||
system encoding, it is recommended to set the environment variable
|
||||
temporarily or use the ``-X utf8`` command line option.
|
||||
You can disable the :ref:`Python UTF-8 Mode <utf8-mode>` via
|
||||
the ``-X utf8=0`` command line option, or the ``PYTHONUTF8=0`` environment
|
||||
variable. See :envvar:`PYTHONUTF8` for disabling UTF-8 mode, and
|
||||
:ref:`setting-envvars` for how to modify environment variables.
|
||||
|
||||
.. hint::
|
||||
Adding ``PYTHONUTF8={0,1}`` to the default environment variables
|
||||
will affect all Python 3.7+ applications on your system.
|
||||
If you have any Python 3.7+ applications which rely on the legacy
|
||||
system encoding, it is recommended to set the environment variable
|
||||
temporarily or use the ``-X utf8=0`` command line option.
|
||||
|
||||
.. note::
|
||||
Even when UTF-8 mode is disabled, Python uses UTF-8 by default
|
||||
|
|
|
@ -172,11 +172,35 @@ production systems where traditional profiling approaches would be too intrusive
|
|||
Other language changes
|
||||
======================
|
||||
|
||||
* Python now uses UTF-8_ as the default encoding, independent of the system's
|
||||
environment. This means that I/O operations without an explicit encoding,
|
||||
e.g. ``open('flying-circus.txt')``, will use UTF-8.
|
||||
UTF-8 is a widely-supported Unicode_ character encoding that has become a
|
||||
*de facto* standard for representing text, including nearly every webpage
|
||||
on the internet, many common file formats, programming languages, and more.
|
||||
|
||||
This only applies when no ``encoding`` argument is given. For best
|
||||
compatibility between versions of Python, ensure that an explicit ``encoding``
|
||||
argument is always provided. The :ref:`opt-in encoding warning <io-encoding-warning>`
|
||||
can be used to identify code that may be affected by this change.
|
||||
The special ``encoding='locale'`` argument uses the current locale
|
||||
encoding, and has been supported since Python 3.10.
|
||||
|
||||
To retain the previous behaviour, Python's UTF-8 mode may be disabled with
|
||||
the :envvar:`PYTHONUTF8=0 <PYTHONUTF8>` environment variable or the
|
||||
:option:`-X utf8=0 <-X>` command line option.
|
||||
|
||||
.. seealso:: :pep:`686` for further details.
|
||||
|
||||
.. _UTF-8: https://en.wikipedia.org/wiki/UTF-8
|
||||
.. _Unicode: https://home.unicode.org/
|
||||
|
||||
(Contributed by Adam Turner in :gh:`133711`; PEP 686 written by Inada Naoki.)
|
||||
|
||||
* Several error messages incorrectly using the term "argument" have been corrected.
|
||||
(Contributed by Stan Ulbrych in :gh:`133382`.)
|
||||
|
||||
|
||||
|
||||
New modules
|
||||
===========
|
||||
|
||||
|
|
|
@ -102,15 +102,14 @@ typedef struct PyPreConfig {
|
|||
|
||||
/* Enable UTF-8 mode? (PEP 540)
|
||||
|
||||
Disabled by default (equals to 0).
|
||||
If equal to 1, use the UTF-8 encoding and use "surrogateescape" for the
|
||||
stdin & stdout error handlers.
|
||||
|
||||
Set to 1 by "-X utf8" and "-X utf8=1" command line options.
|
||||
Set to 1 by PYTHONUTF8=1 environment variable.
|
||||
Enabled by default (equal to 1; PEP 686), or if Py_UTF8Mode=1,
|
||||
or if "-X utf8=1" or PYTHONUTF8=1.
|
||||
|
||||
Set to 0 by "-X utf8=0" and PYTHONUTF8=0.
|
||||
|
||||
If equals to -1, it is set to 1 if the LC_CTYPE locale is "C" or
|
||||
"POSIX", otherwise it is set to 0. Inherit Py_UTF8Mode value value. */
|
||||
Set to 0 by "-X utf8=0" or PYTHONUTF8=0.
|
||||
*/
|
||||
int utf8_mode;
|
||||
|
||||
/* If non-zero, enable the Python Development Mode.
|
||||
|
|
|
@ -651,7 +651,8 @@ else:
|
|||
if sys.flags.warn_default_encoding:
|
||||
import warnings
|
||||
warnings.warn(
|
||||
"UTF-8 Mode affects locale.getpreferredencoding(). Consider locale.getencoding() instead.",
|
||||
"UTF-8 Mode affects locale.getpreferredencoding(). "
|
||||
"Consider locale.getencoding() instead.",
|
||||
EncodingWarning, 2)
|
||||
if sys.flags.utf8_mode:
|
||||
return 'utf-8'
|
||||
|
|
|
@ -380,8 +380,7 @@ def _text_encoding():
|
|||
|
||||
if sys.flags.utf8_mode:
|
||||
return "utf-8"
|
||||
else:
|
||||
return locale.getencoding()
|
||||
return locale.getencoding()
|
||||
|
||||
|
||||
def call(*popenargs, timeout=None, **kwargs):
|
||||
|
|
|
@ -300,6 +300,10 @@ class CmdLineTest(unittest.TestCase):
|
|||
cmd = [sys.executable, '-X', 'utf8', '-c', code, arg]
|
||||
return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)
|
||||
|
||||
def run_no_utf8_mode(arg):
|
||||
cmd = [sys.executable, '-X', 'utf8=0', '-c', code, arg]
|
||||
return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)
|
||||
|
||||
valid_utf8 = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
|
||||
# invalid UTF-8 byte sequences with a valid UTF-8 sequence
|
||||
# in the middle.
|
||||
|
@ -312,7 +316,8 @@ class CmdLineTest(unittest.TestCase):
|
|||
)
|
||||
test_args = [valid_utf8, invalid_utf8]
|
||||
|
||||
for run_cmd in (run_default, run_c_locale, run_utf8_mode):
|
||||
for run_cmd in (run_default, run_c_locale, run_utf8_mode,
|
||||
run_no_utf8_mode):
|
||||
with self.subTest(run_cmd=run_cmd):
|
||||
for arg in test_args:
|
||||
proc = run_cmd(arg)
|
||||
|
|
|
@ -543,7 +543,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
|
|||
'configure_locale': True,
|
||||
'coerce_c_locale': False,
|
||||
'coerce_c_locale_warn': False,
|
||||
'utf8_mode': False,
|
||||
'utf8_mode': True,
|
||||
}
|
||||
if MS_WINDOWS:
|
||||
PRE_CONFIG_COMPAT.update({
|
||||
|
@ -560,7 +560,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
|
|||
configure_locale=False,
|
||||
isolated=True,
|
||||
use_environment=False,
|
||||
utf8_mode=False,
|
||||
utf8_mode=True,
|
||||
dev_mode=False,
|
||||
coerce_c_locale=False,
|
||||
)
|
||||
|
@ -805,12 +805,6 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
|
|||
'stdio_encoding', 'stdio_errors'):
|
||||
expected[key] = self.IGNORE_CONFIG
|
||||
|
||||
if not expected_preconfig['configure_locale']:
|
||||
# UTF-8 Mode depends on the locale. There is no easy way
|
||||
# to guess if UTF-8 Mode will be enabled or not if the locale
|
||||
# is not configured.
|
||||
expected_preconfig['utf8_mode'] = self.IGNORE_CONFIG
|
||||
|
||||
if expected_preconfig['utf8_mode'] == 1:
|
||||
if expected['filesystem_encoding'] is self.GET_DEFAULT_CONFIG:
|
||||
expected['filesystem_encoding'] = 'utf-8'
|
||||
|
|
|
@ -89,8 +89,8 @@ class UTF8ModeTests(unittest.TestCase):
|
|||
# the UTF-8 mode
|
||||
if not self.posix_locale():
|
||||
# PYTHONUTF8 should be ignored if -E is used
|
||||
out = self.get_output('-E', '-c', code, PYTHONUTF8='1')
|
||||
self.assertEqual(out, '0')
|
||||
out = self.get_output('-E', '-c', code, PYTHONUTF8='0')
|
||||
self.assertEqual(out, '1')
|
||||
|
||||
# invalid mode
|
||||
out = self.get_output('-c', code, PYTHONUTF8='xxx', failure=True)
|
||||
|
@ -116,7 +116,7 @@ class UTF8ModeTests(unittest.TestCase):
|
|||
# PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
|
||||
# and has the priority over -X utf8 and PYTHONUTF8
|
||||
out = self.get_output('-X', 'utf8', '-c', code,
|
||||
PYTHONUTF8='strict',
|
||||
PYTHONUTF8='xxx',
|
||||
PYTHONLEGACYWINDOWSFSENCODING='1')
|
||||
self.assertEqual(out, 'mbcs/replace')
|
||||
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
Implement :pep:`686`: Enable :ref:`Python UTF-8 Mode <utf8-mode>` by
|
||||
default. Patch by Adam Turner.
|
|
@ -1854,9 +1854,9 @@ static int test_initconfig_get_api(void)
|
|||
assert(initconfig_getint(config, "dev_mode") == 1);
|
||||
|
||||
// test PyInitConfig_GetInt() on a PyPreConfig option
|
||||
assert(initconfig_getint(config, "utf8_mode") == 0);
|
||||
assert(PyInitConfig_SetInt(config, "utf8_mode", 1) == 0);
|
||||
assert(initconfig_getint(config, "utf8_mode") == 1);
|
||||
assert(PyInitConfig_SetInt(config, "utf8_mode", 0) == 0);
|
||||
assert(initconfig_getint(config, "utf8_mode") == 0);
|
||||
|
||||
// test PyInitConfig_GetStr()
|
||||
char *str;
|
||||
|
|
|
@ -459,7 +459,7 @@ static const char usage_envvars[] =
|
|||
|
||||
/* --- Global configuration variables ----------------------------- */
|
||||
|
||||
/* UTF-8 mode (PEP 540): if equals to 1, use the UTF-8 encoding, and change
|
||||
/* UTF-8 mode (PEP 540): if equal to 1, use the UTF-8 encoding, and change
|
||||
stdin and stdout error handler to "surrogateescape". */
|
||||
int Py_UTF8Mode = 0;
|
||||
int Py_DebugFlag = 0; /* Needed by parser.c */
|
||||
|
|
|
@ -291,12 +291,12 @@ _PyPreConfig_InitCompatConfig(PyPreConfig *config)
|
|||
config->use_environment = -1;
|
||||
config->configure_locale = 1;
|
||||
|
||||
/* bpo-36443: C locale coercion (PEP 538) and UTF-8 Mode (PEP 540)
|
||||
are disabled by default using the Compat configuration.
|
||||
/* gh-80624: C locale coercion (PEP 538) is disabled by default using
|
||||
the Compat configuration.
|
||||
|
||||
Py_UTF8Mode=1 enables the UTF-8 mode. PYTHONUTF8 environment variable
|
||||
Py_UTF8Mode=0 disables the UTF-8 mode. PYTHONUTF8 environment variable
|
||||
is ignored (even if use_environment=1). */
|
||||
config->utf8_mode = 0;
|
||||
config->utf8_mode = 1;
|
||||
config->coerce_c_locale = 0;
|
||||
config->coerce_c_locale_warn = 0;
|
||||
|
||||
|
@ -317,8 +317,8 @@ PyPreConfig_InitPythonConfig(PyPreConfig *config)
|
|||
config->isolated = 0;
|
||||
config->parse_argv = 1;
|
||||
config->use_environment = 1;
|
||||
/* Set to -1 to enable C locale coercion (PEP 538) and UTF-8 Mode (PEP 540)
|
||||
depending on the LC_CTYPE locale, PYTHONUTF8 and PYTHONCOERCECLOCALE
|
||||
/* Set to -1 to enable C locale coercion (PEP 538) depending on
|
||||
the LC_CTYPE locale, PYTHONUTF8 and PYTHONCOERCECLOCALE
|
||||
environment variables. */
|
||||
config->coerce_c_locale = -1;
|
||||
config->coerce_c_locale_warn = -1;
|
||||
|
@ -338,7 +338,7 @@ PyPreConfig_InitIsolatedConfig(PyPreConfig *config)
|
|||
config->configure_locale = 0;
|
||||
config->isolated = 1;
|
||||
config->use_environment = 0;
|
||||
config->utf8_mode = 0;
|
||||
config->utf8_mode = 1;
|
||||
config->dev_mode = 0;
|
||||
#ifdef MS_WINDOWS
|
||||
config->legacy_windows_fs_encoding = 0;
|
||||
|
@ -649,23 +649,7 @@ preconfig_init_utf8_mode(PyPreConfig *config, const _PyPreCmdline *cmdline)
|
|||
return _PyStatus_OK();
|
||||
}
|
||||
|
||||
|
||||
#ifndef MS_WINDOWS
|
||||
if (config->utf8_mode < 0) {
|
||||
/* The C locale and the POSIX locale enable the UTF-8 Mode (PEP 540) */
|
||||
const char *ctype_loc = setlocale(LC_CTYPE, NULL);
|
||||
if (ctype_loc != NULL
|
||||
&& (strcmp(ctype_loc, "C") == 0
|
||||
|| strcmp(ctype_loc, "POSIX") == 0))
|
||||
{
|
||||
config->utf8_mode = 1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (config->utf8_mode < 0) {
|
||||
config->utf8_mode = 0;
|
||||
}
|
||||
config->utf8_mode = 1;
|
||||
return _PyStatus_OK();
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue