mirror of
https://github.com/python/cpython.git
synced 2025-10-14 18:59:46 +00:00
Optimize ascii/latin1+surrogateescape encoders
Issue #25227: Optimize ASCII and latin1 encoders with the ``surrogateescape`` error handler: the encoders are now up to 3 times as fast. Initial patch written by Serhiy Storchaka.
This commit is contained in:
parent
5fbeabcbb6
commit
c3713e9706
4 changed files with 83 additions and 0 deletions
|
@ -117,6 +117,9 @@ Optimizations
|
||||||
* The ASCII decoder is now up to 60 times as fast for error handlers:
|
* The ASCII decoder is now up to 60 times as fast for error handlers:
|
||||||
``surrogateescape``, ``ignore`` and ``replace``.
|
``surrogateescape``, ``ignore`` and ``replace``.
|
||||||
|
|
||||||
|
* The ASCII and the Latin1 encoders are now up to 3 times as fast for the error
|
||||||
|
handler ``surrogateescape``.
|
||||||
|
|
||||||
|
|
||||||
Build and C API Changes
|
Build and C API Changes
|
||||||
=======================
|
=======================
|
||||||
|
|
|
@ -3060,7 +3060,31 @@ class CodePageTest(unittest.TestCase):
|
||||||
|
|
||||||
|
|
||||||
class ASCIITest(unittest.TestCase):
|
class ASCIITest(unittest.TestCase):
|
||||||
|
def test_encode(self):
|
||||||
|
self.assertEqual('abc123'.encode('ascii'), b'abc123')
|
||||||
|
|
||||||
|
def test_encode_error(self):
|
||||||
|
for data, error_handler, expected in (
|
||||||
|
('[\x80\xff\u20ac]', 'ignore', b'[]'),
|
||||||
|
('[\x80\xff\u20ac]', 'replace', b'[???]'),
|
||||||
|
('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
|
||||||
|
('[\x80\xff\u20ac]', 'backslashreplace', b'[\\x80\\xff\\u20ac]'),
|
||||||
|
('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
|
||||||
|
):
|
||||||
|
with self.subTest(data=data, error_handler=error_handler,
|
||||||
|
expected=expected):
|
||||||
|
self.assertEqual(data.encode('ascii', error_handler),
|
||||||
|
expected)
|
||||||
|
|
||||||
|
def test_encode_surrogateescape_error(self):
|
||||||
|
with self.assertRaises(UnicodeEncodeError):
|
||||||
|
# the first character can be encoded, but not the second
|
||||||
|
'\udc80\xff'.encode('ascii', 'surrogateescape')
|
||||||
|
|
||||||
def test_decode(self):
|
def test_decode(self):
|
||||||
|
self.assertEqual(b'abc'.decode('ascii'), 'abc')
|
||||||
|
|
||||||
|
def test_decode_error(self):
|
||||||
for data, error_handler, expected in (
|
for data, error_handler, expected in (
|
||||||
(b'[\x80\xff]', 'ignore', '[]'),
|
(b'[\x80\xff]', 'ignore', '[]'),
|
||||||
(b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
|
(b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
|
||||||
|
@ -3073,5 +3097,41 @@ class ASCIITest(unittest.TestCase):
|
||||||
expected)
|
expected)
|
||||||
|
|
||||||
|
|
||||||
|
class Latin1Test(unittest.TestCase):
|
||||||
|
def test_encode(self):
|
||||||
|
for data, expected in (
|
||||||
|
('abc', b'abc'),
|
||||||
|
('\x80\xe9\xff', b'\x80\xe9\xff'),
|
||||||
|
):
|
||||||
|
with self.subTest(data=data, expected=expected):
|
||||||
|
self.assertEqual(data.encode('latin1'), expected)
|
||||||
|
|
||||||
|
def test_encode_errors(self):
|
||||||
|
for data, error_handler, expected in (
|
||||||
|
('[\u20ac\udc80]', 'ignore', b'[]'),
|
||||||
|
('[\u20ac\udc80]', 'replace', b'[??]'),
|
||||||
|
('[\u20ac\udc80]', 'backslashreplace', b'[\\u20ac\\udc80]'),
|
||||||
|
('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
|
||||||
|
('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
|
||||||
|
):
|
||||||
|
with self.subTest(data=data, error_handler=error_handler,
|
||||||
|
expected=expected):
|
||||||
|
self.assertEqual(data.encode('latin1', error_handler),
|
||||||
|
expected)
|
||||||
|
|
||||||
|
def test_encode_surrogateescape_error(self):
|
||||||
|
with self.assertRaises(UnicodeEncodeError):
|
||||||
|
# the first character can be encoded, but not the second
|
||||||
|
'\udc80\u20ac'.encode('latin1', 'surrogateescape')
|
||||||
|
|
||||||
|
def test_decode(self):
|
||||||
|
for data, expected in (
|
||||||
|
(b'abc', 'abc'),
|
||||||
|
(b'[\x80\xff]', '[\x80\xff]'),
|
||||||
|
):
|
||||||
|
with self.subTest(data=data, expected=expected):
|
||||||
|
self.assertEqual(data.decode('latin1'), expected)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|
|
@ -10,6 +10,10 @@ Release date: XXXX-XX-XX
|
||||||
Core and Builtins
|
Core and Builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- Issue #25227: Optimize ASCII and latin1 encoders with the ``surrogateescape``
|
||||||
|
error handler: the encoders are now up to 3 times as fast. Initial patch
|
||||||
|
written by Serhiy Storchaka.
|
||||||
|
|
||||||
- Issue #25003: On Solaris 11.3 or newer, os.urandom() now uses the
|
- Issue #25003: On Solaris 11.3 or newer, os.urandom() now uses the
|
||||||
getrandom() function instead of the getentropy() function. The getentropy()
|
getrandom() function instead of the getentropy() function. The getentropy()
|
||||||
function is blocking to generate very good quality entropy, os.urandom()
|
function is blocking to generate very good quality entropy, os.urandom()
|
||||||
|
|
|
@ -6532,6 +6532,22 @@ unicode_encode_ucs1(PyObject *unicode,
|
||||||
pos = collend;
|
pos = collend;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case _Py_ERROR_SURROGATEESCAPE:
|
||||||
|
for (i = collstart; i < collend; ++i) {
|
||||||
|
ch = PyUnicode_READ(kind, data, i);
|
||||||
|
if (ch < 0xdc80 || 0xdcff < ch) {
|
||||||
|
/* Not a UTF-8b surrogate */
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
*str++ = (char)(ch - 0xdc00);
|
||||||
|
++pos;
|
||||||
|
}
|
||||||
|
if (i >= collend)
|
||||||
|
break;
|
||||||
|
collstart = pos;
|
||||||
|
assert(collstart != collend);
|
||||||
|
/* fallback to general error handling */
|
||||||
|
|
||||||
default:
|
default:
|
||||||
repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj,
|
repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj,
|
||||||
encoding, reason, unicode, &exc,
|
encoding, reason, unicode, &exc,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue