bpo-34523: Support surrogatepass in locale codecs (GH-8995)

Add support for the "surrogatepass" error handler in
PyUnicode_DecodeFSDefault() and PyUnicode_EncodeFSDefault()
for the UTF-8 encoding.

Changes:

* _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex() now support the
  surrogatepass error handler (_Py_ERROR_SURROGATEPASS).
* _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() now use
  the _Py_error_handler enum instead of "int surrogateescape" to pass
  the error handler. These functions now return -3 if the error
  handler is unknown.
* Add unit tests on _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx()
  in test_codecs.
* Rename get_error_handler() to _Py_GetErrorHandler() and expose it
  as a private function.
* _freeze_importlib doesn't need config.filesystem_errors="strict"
  workaround anymore.
This commit is contained in:
Victor Stinner 2018-08-29 22:21:32 +02:00 committed by GitHub
parent c5989cd876
commit 3d4226a832
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 423 additions and 117 deletions

View file

@ -9,6 +9,11 @@ from unittest import mock
from test import support
try:
import _testcapi
except ImportError as exc:
_testcapi = None
try:
import ctypes
except ImportError:
@ -2051,13 +2056,12 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
@support.cpython_only
def test_basics_capi(self):
from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
s = "abc123" # all codecs should be able to encode these
for encoding in all_unicode_encodings:
if encoding not in broken_unicode_with_stateful:
# check incremental decoder/encoder (fetched via the C API)
try:
cencoder = codec_incrementalencoder(encoding)
cencoder = _testcapi.codec_incrementalencoder(encoding)
except LookupError: # no IncrementalEncoder
pass
else:
@ -2066,7 +2070,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
for c in s:
encodedresult += cencoder.encode(c)
encodedresult += cencoder.encode("", True)
cdecoder = codec_incrementaldecoder(encoding)
cdecoder = _testcapi.codec_incrementaldecoder(encoding)
decodedresult = ""
for c in encodedresult:
decodedresult += cdecoder.decode(bytes([c]))
@ -2077,12 +2081,12 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
if encoding not in ("idna", "mbcs"):
# check incremental decoder/encoder with errors argument
try:
cencoder = codec_incrementalencoder(encoding, "ignore")
cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
except LookupError: # no IncrementalEncoder
pass
else:
encodedresult = b"".join(cencoder.encode(c) for c in s)
cdecoder = codec_incrementaldecoder(encoding, "ignore")
cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
decodedresult = "".join(cdecoder.decode(bytes([c]))
for c in encodedresult)
self.assertEqual(decodedresult, s,
@ -3263,5 +3267,109 @@ class Latin1Test(unittest.TestCase):
self.assertEqual(data.decode('latin1'), expected)
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
class LocaleCodecTest(unittest.TestCase):
"""
Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().
"""
ENCODING = sys.getfilesystemencoding()
STRINGS = ("ascii", "ulatin1:\xa7\xe9",
"u255:\xff",
"UCS:\xe9\u20ac\U0010ffff",
"surrogates:\uDC80\uDCFF")
BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
SURROGATES = "\uDC80\uDCFF"
def encode(self, text, errors="strict"):
return _testcapi.EncodeLocaleEx(text, 0, errors)
def check_encode_strings(self, errors):
for text in self.STRINGS:
with self.subTest(text=text):
try:
expected = text.encode(self.ENCODING, errors)
except UnicodeEncodeError:
with self.assertRaises(RuntimeError) as cm:
self.encode(self.SURROGATES)
errmsg = str(cm.exception)
self.assertTrue(errmsg.startswith("encode error: pos=0, reason="), errmsg)
else:
encoded = self.encode(text, errors)
self.assertEqual(encoded, expected)
def test_encode_strict(self):
self.check_encode_strings("strict")
def test_encode_surrogateescape(self):
self.check_encode_strings("surrogateescape")
def test_encode_surrogatepass(self):
try:
self.encode('', 'surrogatepass')
except ValueError as exc:
if str(exc) == 'unsupported error handler':
self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
f"surrogatepass error handler")
else:
raise
self.check_encode_strings("surrogatepass")
def decode(self, encoded, errors="strict"):
return _testcapi.DecodeLocaleEx(encoded, 0, errors)
def check_decode_strings(self, errors):
is_utf8 = (self.ENCODING == "utf-8")
if is_utf8:
encode_errors = 'surrogateescape'
else:
encode_errors = 'strict'
strings = list(self.BYTES_STRINGS)
for text in self.STRINGS:
try:
encoded = text.encode(self.ENCODING, encode_errors)
if encoded not in strings:
strings.append(encoded)
except UnicodeEncodeError:
encoded = None
if is_utf8:
encoded2 = text.encode(self.ENCODING, 'surrogatepass')
if encoded2 != encoded:
strings.append(encoded2)
for encoded in strings:
with self.subTest(encoded=encoded):
try:
expected = encoded.decode(self.ENCODING, errors)
except UnicodeDecodeError:
with self.assertRaises(RuntimeError) as cm:
self.decode(encoded, errors)
errmsg = str(cm.exception)
self.assertTrue(errmsg.startswith("decode error: "), errmsg)
else:
decoded = self.decode(encoded, errors)
self.assertEqual(decoded, expected)
def test_decode_strict(self):
self.check_decode_strings("strict")
def test_decode_surrogateescape(self):
self.check_decode_strings("surrogateescape")
def test_decode_surrogatepass(self):
try:
self.decode(b'', 'surrogatepass')
except ValueError as exc:
if str(exc) == 'unsupported error handler':
self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
f"surrogatepass error handler")
else:
raise
self.check_decode_strings("surrogatepass")
if __name__ == "__main__":
unittest.main()