From c89a66feb12110e68e63a6293e3ed9c9fd180412 Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+AA-Turner@users.noreply.github.com> Date: Tue, 15 Jul 2025 10:45:41 +0100 Subject: [PATCH] GH-133711: Enable UTF-8 mode by default (PEP 686) (#133712) Co-authored-by: Victor Stinner --- Doc/c-api/init_config.rst | 4 +- Doc/library/os.rst | 37 +++++++++---------- Doc/using/windows.rst | 29 +++++++++------ Doc/whatsnew/3.15.rst | 26 ++++++++++++- Include/cpython/initconfig.h | 13 +++---- Lib/locale.py | 3 +- Lib/subprocess.py | 3 +- Lib/test/test_cmd_line.py | 7 +++- Lib/test/test_embed.py | 10 +---- Lib/test/test_utf8_mode.py | 6 +-- ...-05-08-22-19-10.gh-issue-133711.e91wUy.rst | 2 + Programs/_testembed.c | 4 +- Python/initconfig.c | 2 +- Python/preconfig.c | 32 ++++------------ 14 files changed, 93 insertions(+), 85 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-05-08-22-19-10.gh-issue-133711.e91wUy.rst diff --git a/Doc/c-api/init_config.rst b/Doc/c-api/init_config.rst index 4fd10224262..24be9ead387 100644 --- a/Doc/c-api/init_config.rst +++ b/Doc/c-api/init_config.rst @@ -975,9 +975,7 @@ PyPreConfig Set to ``0`` or ``1`` by the :option:`-X utf8 <-X>` command line option and the :envvar:`PYTHONUTF8` environment variable. - Also set to ``1`` if the ``LC_CTYPE`` locale is ``C`` or ``POSIX``. - - Default: ``-1`` in Python config and ``0`` in isolated config. + Default: ``1``. .. _c-preinit: diff --git a/Doc/library/os.rst b/Doc/library/os.rst index 1e54cfec609..45ec6c7a51b 100644 --- a/Doc/library/os.rst +++ b/Doc/library/os.rst @@ -108,6 +108,12 @@ Python UTF-8 Mode .. versionadded:: 3.7 See :pep:`540` for more details. +.. versionchanged:: next + + Python UTF-8 mode is now enabled by default (:pep:`686`). + It may be disabled by setting :envvar:`PYTHONUTF8=0 <PYTHONUTF8>` as + an environment variable or by using the :option:`-X utf8=0 <-X>` command line option. 
+ The Python UTF-8 Mode ignores the :term:`locale encoding` and forces the usage of the UTF-8 encoding: @@ -139,31 +145,22 @@ level APIs also exhibit different default behaviours: default so that attempting to open a binary file in text mode is likely to raise an exception rather than producing nonsense data. -The :ref:`Python UTF-8 Mode ` is enabled if the LC_CTYPE locale is -``C`` or ``POSIX`` at Python startup (see the :c:func:`PyConfig_Read` -function). - -It can be enabled or disabled using the :option:`-X utf8 <-X>` command line -option and the :envvar:`PYTHONUTF8` environment variable. - -If the :envvar:`PYTHONUTF8` environment variable is not set at all, then the -interpreter defaults to using the current locale settings, *unless* the current -locale is identified as a legacy ASCII-based locale (as described for -:envvar:`PYTHONCOERCECLOCALE`), and locale coercion is either disabled or -fails. In such legacy locales, the interpreter will default to enabling UTF-8 -mode unless explicitly instructed not to do so. - -The Python UTF-8 Mode can only be enabled at the Python startup. Its value +The :ref:`Python UTF-8 Mode ` is enabled by default. +It can be disabled using the :option:`-X utf8=0 <-X>` command line +option or the :envvar:`PYTHONUTF8=0 ` environment variable. +The Python UTF-8 Mode can only be disabled at Python startup. Its value can be read from :data:`sys.flags.utf8_mode `. +If the UTF-8 mode is disabled, the interpreter defaults to using +the current locale settings, *unless* the current locale is identified +as a legacy ASCII-based locale (as described for :envvar:`PYTHONCOERCECLOCALE`), +and locale coercion is either disabled or fails. +In such legacy locales, the interpreter will default to enabling UTF-8 mode +unless explicitly instructed not to do so. + See also the :ref:`UTF-8 mode on Windows ` and the :term:`filesystem encoding and error handler`. -.. seealso:: - - :pep:`686` - Python 3.15 will make :ref:`utf8-mode` default. - .. 
_os-procinfo: diff --git a/Doc/using/windows.rst b/Doc/using/windows.rst index 9628da3d2f6..7cc50bccb37 100644 --- a/Doc/using/windows.rst +++ b/Doc/using/windows.rst @@ -1006,6 +1006,9 @@ UTF-8 mode ========== .. versionadded:: 3.7 +.. versionchanged:: next + + Python UTF-8 mode is now enabled by default (:pep:`686`). Windows still uses legacy encodings for the system encoding (the ANSI Code Page). Python uses it for the default encoding of text files (e.g. @@ -1014,20 +1017,22 @@ Page). Python uses it for the default encoding of text files (e.g. This may cause issues because UTF-8 is widely used on the internet and most Unix systems, including WSL (Windows Subsystem for Linux). -You can use the :ref:`Python UTF-8 Mode ` to change the default text -encoding to UTF-8. You can enable the :ref:`Python UTF-8 Mode ` via -the ``-X utf8`` command line option, or the ``PYTHONUTF8=1`` environment -variable. See :envvar:`PYTHONUTF8` for enabling UTF-8 mode, and -:ref:`setting-envvars` for how to modify environment variables. - -When the :ref:`Python UTF-8 Mode ` is enabled, you can still use the +The :ref:`Python UTF-8 Mode `, enabled by default, can help by +changing the default text encoding to UTF-8. +When the :ref:`UTF-8 mode ` is enabled, you can still use the system encoding (the ANSI Code Page) via the "mbcs" codec. -Note that adding ``PYTHONUTF8=1`` to the default environment variables -will affect all Python 3.7+ applications on your system. -If you have any Python 3.7+ applications which rely on the legacy -system encoding, it is recommended to set the environment variable -temporarily or use the ``-X utf8`` command line option. +You can disable the :ref:`Python UTF-8 Mode ` via +the ``-X utf8=0`` command line option, or the ``PYTHONUTF8=0`` environment +variable. See :envvar:`PYTHONUTF8` for disabling UTF-8 mode, and +:ref:`setting-envvars` for how to modify environment variables. + +.. 
hint:: + Adding ``PYTHONUTF8={0,1}`` to the default environment variables + will affect all Python 3.7+ applications on your system. + If you have any Python 3.7+ applications which rely on the legacy + system encoding, it is recommended to set the environment variable + temporarily or use the ``-X utf8`` command line option. .. note:: Even when UTF-8 mode is disabled, Python uses UTF-8 by default diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index dd0bb6bd5b8..fe3d45b83a5 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -172,11 +172,35 @@ production systems where traditional profiling approaches would be too intrusive Other language changes ====================== +* Python now uses UTF-8_ as the default encoding, independent of the system's + environment. This means that I/O operations without an explicit encoding, + e.g. ``open('flying-circus.txt')``, will use UTF-8. + UTF-8 is a widely-supported Unicode_ character encoding that has become a + *de facto* standard for representing text, including nearly every webpage + on the internet, many common file formats, programming languages, and more. + + This only applies when no ``encoding`` argument is given. For best + compatibility between versions of Python, ensure that an explicit ``encoding`` + argument is always provided. The :ref:`opt-in encoding warning ` + can be used to identify code that may be affected by this change. + The special ``encoding='locale'`` argument uses the current locale + encoding, and has been supported since Python 3.10. + + To retain the previous behaviour, Python's UTF-8 mode may be disabled with + the :envvar:`PYTHONUTF8=0 ` environment variable or the + :option:`-X utf8=0 <-X>` command line option. + + .. seealso:: :pep:`686` for further details. + + .. _UTF-8: https://en.wikipedia.org/wiki/UTF-8 + .. _Unicode: https://home.unicode.org/ + + (Contributed by Adam Turner in :gh:`133711`; PEP 686 written by Inada Naoki.) 
+ * Several error messages incorrectly using the term "argument" have been corrected. (Contributed by Stan Ulbrych in :gh:`133382`.) - New modules =========== diff --git a/Include/cpython/initconfig.h b/Include/cpython/initconfig.h index 7ce4acfeb71..1c979d91a40 100644 --- a/Include/cpython/initconfig.h +++ b/Include/cpython/initconfig.h @@ -102,15 +102,14 @@ typedef struct PyPreConfig { /* Enable UTF-8 mode? (PEP 540) - Disabled by default (equals to 0). + If equal to 1, use the UTF-8 encoding and use "surrogateescape" for the + stdin & stdout error handlers. - Set to 1 by "-X utf8" and "-X utf8=1" command line options. - Set to 1 by PYTHONUTF8=1 environment variable. + Enabled by default (equal to 1; PEP 686), or if Py_UTF8Mode=1, + or if "-X utf8=1" or PYTHONUTF8=1. - Set to 0 by "-X utf8=0" and PYTHONUTF8=0. - - If equals to -1, it is set to 1 if the LC_CTYPE locale is "C" or - "POSIX", otherwise it is set to 0. Inherit Py_UTF8Mode value value. */ + Set to 0 by "-X utf8=0" or PYTHONUTF8=0. + */ int utf8_mode; /* If non-zero, enable the Python Development Mode. diff --git a/Lib/locale.py b/Lib/locale.py index dfedc6386cb..0bde7ed51c6 100644 --- a/Lib/locale.py +++ b/Lib/locale.py @@ -651,7 +651,8 @@ else: if sys.flags.warn_default_encoding: import warnings warnings.warn( - "UTF-8 Mode affects locale.getpreferredencoding(). Consider locale.getencoding() instead.", + "UTF-8 Mode affects locale.getpreferredencoding(). 
" + "Consider locale.getencoding() instead.", EncodingWarning, 2) if sys.flags.utf8_mode: return 'utf-8' diff --git a/Lib/subprocess.py b/Lib/subprocess.py index 54c2eb515b6..79251bd5310 100644 --- a/Lib/subprocess.py +++ b/Lib/subprocess.py @@ -380,8 +380,7 @@ def _text_encoding(): if sys.flags.utf8_mode: return "utf-8" - else: - return locale.getencoding() + return locale.getencoding() def call(*popenargs, timeout=None, **kwargs): diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py index c17d749d4a1..f30a1874ab9 100644 --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -300,6 +300,10 @@ class CmdLineTest(unittest.TestCase): cmd = [sys.executable, '-X', 'utf8', '-c', code, arg] return subprocess.run(cmd, stdout=subprocess.PIPE, text=True) + def run_no_utf8_mode(arg): + cmd = [sys.executable, '-X', 'utf8=0', '-c', code, arg] + return subprocess.run(cmd, stdout=subprocess.PIPE, text=True) + valid_utf8 = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8') # invalid UTF-8 byte sequences with a valid UTF-8 sequence # in the middle. 
@@ -312,7 +316,8 @@ class CmdLineTest(unittest.TestCase): ) test_args = [valid_utf8, invalid_utf8] - for run_cmd in (run_default, run_c_locale, run_utf8_mode): + for run_cmd in (run_default, run_c_locale, run_utf8_mode, + run_no_utf8_mode): with self.subTest(run_cmd=run_cmd): for arg in test_args: proc = run_cmd(arg) diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py index 89f4aebe28f..22dfdb6bb6f 100644 --- a/Lib/test/test_embed.py +++ b/Lib/test/test_embed.py @@ -543,7 +543,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase): 'configure_locale': True, 'coerce_c_locale': False, 'coerce_c_locale_warn': False, - 'utf8_mode': False, + 'utf8_mode': True, } if MS_WINDOWS: PRE_CONFIG_COMPAT.update({ @@ -560,7 +560,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase): configure_locale=False, isolated=True, use_environment=False, - utf8_mode=False, + utf8_mode=True, dev_mode=False, coerce_c_locale=False, ) @@ -805,12 +805,6 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase): 'stdio_encoding', 'stdio_errors'): expected[key] = self.IGNORE_CONFIG - if not expected_preconfig['configure_locale']: - # UTF-8 Mode depends on the locale. There is no easy way - # to guess if UTF-8 Mode will be enabled or not if the locale - # is not configured. 
- expected_preconfig['utf8_mode'] = self.IGNORE_CONFIG - if expected_preconfig['utf8_mode'] == 1: if expected['filesystem_encoding'] is self.GET_DEFAULT_CONFIG: expected['filesystem_encoding'] = 'utf-8' diff --git a/Lib/test/test_utf8_mode.py b/Lib/test/test_utf8_mode.py index f66881044e1..b8e49440c9f 100644 --- a/Lib/test/test_utf8_mode.py +++ b/Lib/test/test_utf8_mode.py @@ -89,8 +89,8 @@ class UTF8ModeTests(unittest.TestCase): # the UTF-8 mode if not self.posix_locale(): # PYTHONUTF8 should be ignored if -E is used - out = self.get_output('-E', '-c', code, PYTHONUTF8='1') - self.assertEqual(out, '0') + out = self.get_output('-E', '-c', code, PYTHONUTF8='0') + self.assertEqual(out, '1') # invalid mode out = self.get_output('-c', code, PYTHONUTF8='xxx', failure=True) @@ -116,7 +116,7 @@ class UTF8ModeTests(unittest.TestCase): # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode # and has the priority over -X utf8 and PYTHONUTF8 out = self.get_output('-X', 'utf8', '-c', code, - PYTHONUTF8='strict', + PYTHONUTF8='xxx', PYTHONLEGACYWINDOWSFSENCODING='1') self.assertEqual(out, 'mbcs/replace') diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-05-08-22-19-10.gh-issue-133711.e91wUy.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-05-08-22-19-10.gh-issue-133711.e91wUy.rst new file mode 100644 index 00000000000..c8d3d62763d --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-05-08-22-19-10.gh-issue-133711.e91wUy.rst @@ -0,0 +1,2 @@ +Implement :pep:`686`: Enable :ref:`Python UTF-8 Mode ` by +default. Patch by Adam Turner. 
diff --git a/Programs/_testembed.c b/Programs/_testembed.c index 577da65c7cd..88936bbc699 100644 --- a/Programs/_testembed.c +++ b/Programs/_testembed.c @@ -1854,9 +1854,9 @@ static int test_initconfig_get_api(void) assert(initconfig_getint(config, "dev_mode") == 1); // test PyInitConfig_GetInt() on a PyPreConfig option - assert(initconfig_getint(config, "utf8_mode") == 0); - assert(PyInitConfig_SetInt(config, "utf8_mode", 1) == 0); assert(initconfig_getint(config, "utf8_mode") == 1); + assert(PyInitConfig_SetInt(config, "utf8_mode", 0) == 0); + assert(initconfig_getint(config, "utf8_mode") == 0); // test PyInitConfig_GetStr() char *str; diff --git a/Python/initconfig.c b/Python/initconfig.c index 73a9a9bf1ca..cc0db19d416 100644 --- a/Python/initconfig.c +++ b/Python/initconfig.c @@ -459,7 +459,7 @@ static const char usage_envvars[] = /* --- Global configuration variables ----------------------------- */ -/* UTF-8 mode (PEP 540): if equals to 1, use the UTF-8 encoding, and change +/* UTF-8 mode (PEP 540): if equal to 1, use the UTF-8 encoding, and change stdin and stdout error handler to "surrogateescape". */ int Py_UTF8Mode = 0; int Py_DebugFlag = 0; /* Needed by parser.c */ diff --git a/Python/preconfig.c b/Python/preconfig.c index 67b2d2f2dc1..e4cd10d9e3d 100644 --- a/Python/preconfig.c +++ b/Python/preconfig.c @@ -291,12 +291,12 @@ _PyPreConfig_InitCompatConfig(PyPreConfig *config) config->use_environment = -1; config->configure_locale = 1; - /* bpo-36443: C locale coercion (PEP 538) and UTF-8 Mode (PEP 540) - are disabled by default using the Compat configuration. + /* gh-80624: C locale coercion (PEP 538) is disabled by default using + the Compat configuration. - Py_UTF8Mode=1 enables the UTF-8 mode. PYTHONUTF8 environment variable + Py_UTF8Mode=0 disables the UTF-8 mode. PYTHONUTF8 environment variable is ignored (even if use_environment=1). 
*/ - config->utf8_mode = 0; + config->utf8_mode = 1; config->coerce_c_locale = 0; config->coerce_c_locale_warn = 0; @@ -317,8 +317,8 @@ PyPreConfig_InitPythonConfig(PyPreConfig *config) config->isolated = 0; config->parse_argv = 1; config->use_environment = 1; - /* Set to -1 to enable C locale coercion (PEP 538) and UTF-8 Mode (PEP 540) - depending on the LC_CTYPE locale, PYTHONUTF8 and PYTHONCOERCECLOCALE + /* Set to -1 to enable C locale coercion (PEP 538) depending on + the LC_CTYPE locale, PYTHONUTF8 and PYTHONCOERCECLOCALE environment variables. */ config->coerce_c_locale = -1; config->coerce_c_locale_warn = -1; @@ -338,7 +338,7 @@ PyPreConfig_InitIsolatedConfig(PyPreConfig *config) config->configure_locale = 0; config->isolated = 1; config->use_environment = 0; - config->utf8_mode = 0; + config->utf8_mode = 1; config->dev_mode = 0; #ifdef MS_WINDOWS config->legacy_windows_fs_encoding = 0; @@ -649,23 +649,7 @@ preconfig_init_utf8_mode(PyPreConfig *config, const _PyPreCmdline *cmdline) return _PyStatus_OK(); } - -#ifndef MS_WINDOWS - if (config->utf8_mode < 0) { - /* The C locale and the POSIX locale enable the UTF-8 Mode (PEP 540) */ - const char *ctype_loc = setlocale(LC_CTYPE, NULL); - if (ctype_loc != NULL - && (strcmp(ctype_loc, "C") == 0 - || strcmp(ctype_loc, "POSIX") == 0)) - { - config->utf8_mode = 1; - } - } -#endif - - if (config->utf8_mode < 0) { - config->utf8_mode = 0; - } + config->utf8_mode = 1; return _PyStatus_OK(); }