Optimize ascii/latin1+surrogateescape encoders

Issue #25227: Optimize ASCII and latin1 encoders with the ``surrogateescape`` error handler: the encoders are now up to 3 times as fast. Initial patch written by Serhiy Storchaka.
2025-10-14 18:59:46 +00:00 · 2015-09-29 12:32:13 +02:00 · 2015-09-29 12:32:13 +02:00 · c3713e9706
commit c3713e9706
parent 5fbeabcbb6
4 changed files with 83 additions and 0 deletions
--- a/Doc/whatsnew/3.6.rst
+++ b/Doc/whatsnew/3.6.rst
@ -117,6 +117,9 @@ Optimizations
 * The ASCII decoder is now up to 60 times as fast for error handlers:
  ``surrogateescape``, ``ignore`` and ``replace``.
 * The ASCII and the Latin1 encoders are now up to 3 times as fast for the error
  handler ``surrogateescape``.
 Build and C API Changes
 =======================
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@ -3060,7 +3060,31 @@ class CodePageTest(unittest.TestCase):
 class ASCIITest(unittest.TestCase):
    def test_encode(self):
        # Pure-ASCII text must encode to bytes with the same values.
        self.assertEqual('abc123'.encode('ascii'), b'abc123')
    def test_encode_error(self):
        # Each row is (text, error handler name, expected bytes).  The
        # characters between the brackets cannot be encoded to ASCII, so the
        # result depends entirely on the error handler.
        for data, error_handler, expected in (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac]', 'backslashreplace', b'[\\x80\\xff\\u20ac]'),
            # surrogateescape: U+DC80 and U+DCFF become bytes 0x80 and 0xFF
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        ):
            # Run every row as its own sub-test, labelled with its parameters.
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.encode('ascii', error_handler),
                                 expected)
    def test_encode_surrogateescape_error(self):
        # surrogateescape must still raise when the input contains a
        # non-ASCII character that is not an escaped-byte surrogate.
        with self.assertRaises(UnicodeEncodeError):
            # the first character (U+DC80) can be encoded by the handler,
            # but not the second (U+00FF)
            '\udc80\xff'.encode('ascii', 'surrogateescape')
    def test_decode(self):
        # ASCII bytes must decode to the text with the same code points.
        self.assertEqual(b'abc'.decode('ascii'), 'abc')
    def test_decode_error(self):
        for data, error_handler, expected in (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
@ -3073,5 +3097,41 @@ class ASCIITest(unittest.TestCase):
                                 expected)
 class Latin1Test(unittest.TestCase):
    # Tests for the 'latin1' codec: plain encoding and decoding, plus the
    # behaviour of each error handler on characters it cannot encode.
    def test_encode(self):
        # Each row is (text, expected bytes); every character here is at most
        # U+00FF and encodes to the byte with the same value.
        for data, expected in (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        ):
            with self.subTest(data=data, expected=expected):
                self.assertEqual(data.encode('latin1'), expected)
    def test_encode_errors(self):
        # Each row is (text, error handler name, expected bytes).  U+20AC and
        # the lone surrogate U+DC80 cannot be encoded to latin1, so the result
        # depends on the error handler.
        for data, error_handler, expected in (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\udc80]', 'backslashreplace', b'[\\u20ac\\udc80]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            # surrogateescape: U+DC80 and U+DCFF become bytes 0x80 and 0xFF
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        ):
            # Run every row as its own sub-test, labelled with its parameters.
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.encode('latin1', error_handler),
                                 expected)
    def test_encode_surrogateescape_error(self):
        # surrogateescape must still raise when the input contains a
        # character outside latin1 that is not an escaped-byte surrogate.
        with self.assertRaises(UnicodeEncodeError):
            # the first character (U+DC80) can be encoded by the handler,
            # but not the second (U+20AC)
            '\udc80\u20ac'.encode('latin1', 'surrogateescape')
    def test_decode(self):
        # Each row is (bytes, expected text); every byte decodes to the code
        # point with the same value, including 0x80 and 0xFF.
        for data, expected in (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        ):
            with self.subTest(data=data, expected=expected):
                self.assertEqual(data.decode('latin1'), expected)
 if __name__ == "__main__":
    unittest.main()
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -10,6 +10,10 @@ Release date: XXXX-XX-XX
 Core and Builtins
 -----------------
 - Issue #25227: Optimize ASCII and latin1 encoders with the ``surrogateescape``
  error handler: the encoders are now up to 3 times as fast. Initial patch
  written by Serhiy Storchaka.
 - Issue #25003: On Solaris 11.3 or newer, os.urandom() now uses the
  getrandom() function instead of the getentropy() function. The getentropy()
  function is blocking to generate very good quality entropy, os.urandom()
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -6532,6 +6532,22 @@ unicode_encode_ucs1(PyObject *unicode,
                pos = collend;
                break;
            case _Py_ERROR_SURROGATEESCAPE:
                for (i = collstart; i < collend; ++i) {
                    ch = PyUnicode_READ(kind, data, i);
                    if (ch < 0xdc80 || 0xdcff < ch) {
                        /* Not a UTF-8b surrogate */
                        break;
                    }
                    *str++ = (char)(ch - 0xdc00);
                    ++pos;
                }
                if (i >= collend)
                    break;
                collstart = pos;
                assert(collstart != collend);
                /* fallback to general error handling */
            default:
                repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj,
                                                              encoding, reason, unicode, &exc,