Optimize ascii/latin1+surrogateescape encoders

Issue #25227: Optimize ASCII and latin1 encoders with the ``surrogateescape``
error handler: the encoders are now up to 3 times as fast.

Initial patch written by Serhiy Storchaka.
This commit is contained in:
Victor Stinner 2015-09-29 12:32:13 +02:00
parent 5fbeabcbb6
commit c3713e9706
4 changed files with 83 additions and 0 deletions

View file

@ -117,6 +117,9 @@ Optimizations
* The ASCII decoder is now up to 60 times as fast for error handlers: * The ASCII decoder is now up to 60 times as fast for error handlers:
``surrogateescape``, ``ignore`` and ``replace``. ``surrogateescape``, ``ignore`` and ``replace``.
* The ASCII and the Latin1 encoders are now up to 3 times as fast for the error
handler ``surrogateescape``.
Build and C API Changes Build and C API Changes
======================= =======================

View file

@ -3060,7 +3060,31 @@ class CodePageTest(unittest.TestCase):
class ASCIITest(unittest.TestCase): class ASCIITest(unittest.TestCase):
def test_encode(self):
self.assertEqual('abc123'.encode('ascii'), b'abc123')
def test_encode_error(self):
for data, error_handler, expected in (
('[\x80\xff\u20ac]', 'ignore', b'[]'),
('[\x80\xff\u20ac]', 'replace', b'[???]'),
('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
('[\x80\xff\u20ac]', 'backslashreplace', b'[\\x80\\xff\\u20ac]'),
('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
):
with self.subTest(data=data, error_handler=error_handler,
expected=expected):
self.assertEqual(data.encode('ascii', error_handler),
expected)
def test_encode_surrogateescape_error(self):
with self.assertRaises(UnicodeEncodeError):
# the first character can be encoded, but not the second
'\udc80\xff'.encode('ascii', 'surrogateescape')
def test_decode(self): def test_decode(self):
self.assertEqual(b'abc'.decode('ascii'), 'abc')
def test_decode_error(self):
for data, error_handler, expected in ( for data, error_handler, expected in (
(b'[\x80\xff]', 'ignore', '[]'), (b'[\x80\xff]', 'ignore', '[]'),
(b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'), (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
@ -3073,5 +3097,41 @@ class ASCIITest(unittest.TestCase):
expected) expected)
class Latin1Test(unittest.TestCase):
    """Tests for the latin1 codec: plain round trips and encode error handlers."""

    def test_encode(self):
        # Every code point in U+0000-U+00FF maps to the byte of the same value.
        for data, expected in (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        ):
            with self.subTest(data=data, expected=expected):
                self.assertEqual(data.encode('latin1'), expected)

    def test_encode_errors(self):
        # U+20AC and the lone surrogate U+DC80 are both outside latin1, so
        # each error handler is exercised on two consecutive bad characters.
        for data, error_handler, expected in (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\udc80]', 'backslashreplace', b'[\\u20ac\\udc80]'),
            # xmlcharrefreplace emits decimal numeric character references:
            # 0x20ac == 8364 and 0xdc80 == 56448.
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            # surrogateescape maps U+DC80..U+DCFF back to bytes 0x80..0xFF.
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.encode('latin1', error_handler),
                                 expected)

    def test_encode_surrogateescape_error(self):
        with self.assertRaises(UnicodeEncodeError):
            # the first character can be encoded, but not the second
            '\udc80\u20ac'.encode('latin1', 'surrogateescape')

    def test_decode(self):
        # latin1 decoding never fails: every byte maps to the same code point.
        for data, expected in (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        ):
            with self.subTest(data=data, expected=expected):
                self.assertEqual(data.decode('latin1'), expected)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()

View file

@ -10,6 +10,10 @@ Release date: XXXX-XX-XX
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #25227: Optimize ASCII and latin1 encoders with the ``surrogateescape``
error handler: the encoders are now up to 3 times as fast. Initial patch
written by Serhiy Storchaka.
- Issue #25003: On Solaris 11.3 or newer, os.urandom() now uses the - Issue #25003: On Solaris 11.3 or newer, os.urandom() now uses the
getrandom() function instead of the getentropy() function. The getentropy() getrandom() function instead of the getentropy() function. The getentropy()
function is blocking to generate very good quality entropy, os.urandom() function is blocking to generate very good quality entropy, os.urandom()

View file

@ -6532,6 +6532,22 @@ unicode_encode_ucs1(PyObject *unicode,
pos = collend; pos = collend;
break; break;
case _Py_ERROR_SURROGATEESCAPE:
for (i = collstart; i < collend; ++i) {
ch = PyUnicode_READ(kind, data, i);
if (ch < 0xdc80 || 0xdcff < ch) {
/* Not a UTF-8b surrogate */
break;
}
*str++ = (char)(ch - 0xdc00);
++pos;
}
if (i >= collend)
break;
collstart = pos;
assert(collstart != collend);
/* fallback to general error handling */
default: default:
repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj, repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj,
encoding, reason, unicode, &exc, encoding, reason, unicode, &exc,