GH-133711: Enable UTF-8 mode by default (PEP 686) (#133712)

Co-authored-by: Victor Stinner <vstinner@python.org>
2025-12-22 08:29:12 +00:00 · 2025-07-15 10:45:41 +01:00 · 2025-07-15 10:45:41 +01:00 · c89a66feb1
commit c89a66feb1
parent f320c951c3
14 changed files with 93 additions and 85 deletions
--- a/Lib/locale.py
+++ b/Lib/locale.py
@@ -651,7 +651,8 @@ else:
        if sys.flags.warn_default_encoding:
            import warnings
            warnings.warn(
-                "UTF-8 Mode affects locale.getpreferredencoding(). Consider locale.getencoding() instead.",
+                "UTF-8 Mode affects locale.getpreferredencoding(). "
+                "Consider locale.getencoding() instead.",
                EncodingWarning, 2)
        if sys.flags.utf8_mode:
            return 'utf-8'
--- a/Lib/subprocess.py
+++ b/Lib/subprocess.py
@@ -380,8 +380,7 @@ def _text_encoding():

    if sys.flags.utf8_mode:
        return "utf-8"
-    else:
-        return locale.getencoding()
+    return locale.getencoding()


 def call(*popenargs, timeout=None, **kwargs):
--- a/Lib/test/test_cmd_line.py
+++ b/Lib/test/test_cmd_line.py
@@ -300,6 +300,10 @@ class CmdLineTest(unittest.TestCase):
            cmd = [sys.executable, '-X', 'utf8', '-c', code, arg]
            return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)

+        def run_no_utf8_mode(arg):
+            cmd = [sys.executable, '-X', 'utf8=0', '-c', code, arg]
+            return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)
+
        valid_utf8 = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
        # invalid UTF-8 byte sequences with a valid UTF-8 sequence
        # in the middle.
@@ -312,7 +316,8 @@ class CmdLineTest(unittest.TestCase):
        )
        test_args = [valid_utf8, invalid_utf8]

-        for run_cmd in (run_default, run_c_locale, run_utf8_mode):
+        for run_cmd in (run_default, run_c_locale, run_utf8_mode,
+                        run_no_utf8_mode):
            with self.subTest(run_cmd=run_cmd):
                for arg in test_args:
                    proc = run_cmd(arg)
--- a/Lib/test/test_embed.py
+++ b/Lib/test/test_embed.py
@@ -543,7 +543,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
        'configure_locale': True,
        'coerce_c_locale': False,
        'coerce_c_locale_warn': False,
-        'utf8_mode': False,
+        'utf8_mode': True,
    }
    if MS_WINDOWS:
        PRE_CONFIG_COMPAT.update({
@@ -560,7 +560,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
        configure_locale=False,
        isolated=True,
        use_environment=False,
-        utf8_mode=False,
+        utf8_mode=True,
        dev_mode=False,
        coerce_c_locale=False,
    )
@@ -805,12 +805,6 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
                        'stdio_encoding', 'stdio_errors'):
                expected[key] = self.IGNORE_CONFIG

-        if not expected_preconfig['configure_locale']:
-            # UTF-8 Mode depends on the locale. There is no easy way
-            # to guess if UTF-8 Mode will be enabled or not if the locale
-            # is not configured.
-            expected_preconfig['utf8_mode'] = self.IGNORE_CONFIG
-
        if expected_preconfig['utf8_mode'] == 1:
            if expected['filesystem_encoding'] is self.GET_DEFAULT_CONFIG:
                expected['filesystem_encoding'] = 'utf-8'
--- a/Lib/test/test_utf8_mode.py
+++ b/Lib/test/test_utf8_mode.py
@@ -89,8 +89,8 @@ class UTF8ModeTests(unittest.TestCase):
        # the UTF-8 mode
        if not self.posix_locale():
            # PYTHONUTF8 should be ignored if -E is used
-            out = self.get_output('-E', '-c', code, PYTHONUTF8='1')
-            self.assertEqual(out, '0')
+            out = self.get_output('-E', '-c', code, PYTHONUTF8='0')
+            self.assertEqual(out, '1')

        # invalid mode
        out = self.get_output('-c', code, PYTHONUTF8='xxx', failure=True)
@@ -116,7 +116,7 @@ class UTF8ModeTests(unittest.TestCase):
            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
            # and has the priority over -X utf8 and PYTHONUTF8
            out = self.get_output('-X', 'utf8', '-c', code,
-                                  PYTHONUTF8='strict',
+                                  PYTHONUTF8='xxx',
                                  PYTHONLEGACYWINDOWSFSENCODING='1')
            self.assertEqual(out, 'mbcs/replace')