mirror of
https://github.com/python/cpython.git
synced 2025-10-14 02:43:49 +00:00
Close #13247: Add cp65001 codec, the Windows UTF-8 (CP_UTF8)
This commit is contained in:
parent
cc9695643f
commit
2f3ca9f20e
5 changed files with 168 additions and 60 deletions
|
@ -1011,6 +1011,11 @@ particular, the following variants typically exist:
|
||||||
+-----------------+--------------------------------+--------------------------------+
|
+-----------------+--------------------------------+--------------------------------+
|
||||||
| cp1258 | windows-1258 | Vietnamese |
|
| cp1258 | windows-1258 | Vietnamese |
|
||||||
+-----------------+--------------------------------+--------------------------------+
|
+-----------------+--------------------------------+--------------------------------+
|
||||||
|
| cp65001 | | Windows only: Windows UTF-8 |
|
||||||
|
| | | (``CP_UTF8``) |
|
||||||
|
| | | |
|
||||||
|
| | | .. versionadded:: 3.3 |
|
||||||
|
+-----------------+--------------------------------+--------------------------------+
|
||||||
| euc_jp | eucjp, ujis, u-jis | Japanese |
|
| euc_jp | eucjp, ujis, u-jis | Japanese |
|
||||||
+-----------------+--------------------------------+--------------------------------+
|
+-----------------+--------------------------------+--------------------------------+
|
||||||
| euc_jis_2004 | jisx0213, eucjis2004 | Japanese |
|
| euc_jis_2004 | jisx0213, eucjis2004 | Japanese |
|
||||||
|
|
|
@ -225,6 +225,11 @@ The :mod:`~encodings.mbcs` codec has been rewritten to handle correctly
|
||||||
:mod:`~encodings.mbcs` codec is now supporting all error handlers, instead of
|
:mod:`~encodings.mbcs` codec is now supporting all error handlers, instead of
|
||||||
only ``replace`` to encode and ``ignore`` to decode.
|
only ``replace`` to encode and ``ignore`` to decode.
|
||||||
|
|
||||||
|
A new Windows-only codec has been added: ``cp65001`` (:issue:`13247`). It is
|
||||||
|
the Windows code page 65001 (Windows UTF-8, ``CP_UTF8``). For example, it is
|
||||||
|
used by ``sys.stdout`` if the console output code page is set to cp65001 (e.g.
|
||||||
|
using the ``chcp 65001`` command).
|
||||||
|
|
||||||
Multibyte CJK decoders now resynchronize faster. They only ignore the first
|
Multibyte CJK decoders now resynchronize faster. They only ignore the first
|
||||||
byte of an invalid byte sequence. For example, ``b'\xff\n'.decode('gb2312',
|
byte of an invalid byte sequence. For example, ``b'\xff\n'.decode('gb2312',
|
||||||
'replace')`` now returns a ``\n`` after the replacement character.
|
'replace')`` now returns a ``\n`` after the replacement character.
|
||||||
|
|
40
Lib/encodings/cp65001.py
Normal file
40
Lib/encodings/cp65001.py
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
"""
|
||||||
|
Code page 65001: Windows UTF-8 (CP_UTF8).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import codecs
|
||||||
|
import functools
|
||||||
|
|
||||||
|
if not hasattr(codecs, 'code_page_encode'):
|
||||||
|
raise LookupError("cp65001 encoding is only available on Windows")
|
||||||
|
|
||||||
|
### Codec APIs
|
||||||
|
|
||||||
|
encode = functools.partial(codecs.code_page_encode, 65001)
|
||||||
|
decode = functools.partial(codecs.code_page_decode, 65001)
|
||||||
|
|
||||||
|
class IncrementalEncoder(codecs.IncrementalEncoder):
|
||||||
|
def encode(self, input, final=False):
|
||||||
|
return encode(input, self.errors)[0]
|
||||||
|
|
||||||
|
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
|
||||||
|
_buffer_decode = decode
|
||||||
|
|
||||||
|
class StreamWriter(codecs.StreamWriter):
|
||||||
|
encode = encode
|
||||||
|
|
||||||
|
class StreamReader(codecs.StreamReader):
|
||||||
|
decode = decode
|
||||||
|
|
||||||
|
### encodings module API
|
||||||
|
|
||||||
|
def getregentry():
|
||||||
|
return codecs.CodecInfo(
|
||||||
|
name='cp65001',
|
||||||
|
encode=encode,
|
||||||
|
decode=decode,
|
||||||
|
incrementalencoder=IncrementalEncoder,
|
||||||
|
incrementaldecoder=IncrementalDecoder,
|
||||||
|
streamreader=StreamReader,
|
||||||
|
streamwriter=StreamWriter,
|
||||||
|
)
|
|
@ -4,6 +4,11 @@ import codecs
|
||||||
import locale
|
import locale
|
||||||
import sys, _testcapi, io
|
import sys, _testcapi, io
|
||||||
|
|
||||||
|
if sys.platform == 'win32':
|
||||||
|
VISTA_OR_LATER = (sys.getwindowsversion().major >= 6)
|
||||||
|
else:
|
||||||
|
VISTA_OR_LATER = False
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import ctypes
|
import ctypes
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
@ -636,6 +641,107 @@ class UTF8Test(ReadTest):
|
||||||
"\U00010fff\uD800")
|
"\U00010fff\uD800")
|
||||||
self.assertTrue(codecs.lookup_error("surrogatepass"))
|
self.assertTrue(codecs.lookup_error("surrogatepass"))
|
||||||
|
|
||||||
|
@unittest.skipUnless(sys.platform == 'win32',
|
||||||
|
'cp65001 is a Windows-only codec')
|
||||||
|
class CP65001Test(ReadTest):
|
||||||
|
encoding = "cp65001"
|
||||||
|
|
||||||
|
def test_encode(self):
|
||||||
|
tests = [
|
||||||
|
('abc', 'strict', b'abc'),
|
||||||
|
('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
|
||||||
|
('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
|
||||||
|
]
|
||||||
|
if VISTA_OR_LATER:
|
||||||
|
tests.extend((
|
||||||
|
('\udc80', 'strict', None),
|
||||||
|
('\udc80', 'ignore', b''),
|
||||||
|
('\udc80', 'replace', b'?'),
|
||||||
|
('\udc80', 'backslashreplace', b'\\udc80'),
|
||||||
|
('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
|
||||||
|
))
|
||||||
|
else:
|
||||||
|
tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
|
||||||
|
for text, errors, expected in tests:
|
||||||
|
if expected is not None:
|
||||||
|
try:
|
||||||
|
encoded = text.encode('cp65001', errors)
|
||||||
|
except UnicodeEncodeError as err:
|
||||||
|
self.fail('Unable to encode %a to cp65001 with '
|
||||||
|
'errors=%r: %s' % (text, errors, err))
|
||||||
|
self.assertEqual(encoded, expected,
|
||||||
|
'%a.encode("cp65001", %r)=%a != %a'
|
||||||
|
% (text, errors, encoded, expected))
|
||||||
|
else:
|
||||||
|
self.assertRaises(UnicodeEncodeError,
|
||||||
|
text.encode, "cp65001", errors)
|
||||||
|
|
||||||
|
def test_decode(self):
|
||||||
|
tests = [
|
||||||
|
(b'abc', 'strict', 'abc'),
|
||||||
|
(b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
|
||||||
|
(b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
|
||||||
|
(b'\xef\xbf\xbd', 'strict', '\ufffd'),
|
||||||
|
(b'[\xc3\xa9]', 'strict', '[\xe9]'),
|
||||||
|
# invalid bytes
|
||||||
|
(b'[\xff]', 'strict', None),
|
||||||
|
(b'[\xff]', 'ignore', '[]'),
|
||||||
|
(b'[\xff]', 'replace', '[\ufffd]'),
|
||||||
|
(b'[\xff]', 'surrogateescape', '[\udcff]'),
|
||||||
|
]
|
||||||
|
if VISTA_OR_LATER:
|
||||||
|
tests.extend((
|
||||||
|
(b'[\xed\xb2\x80]', 'strict', None),
|
||||||
|
(b'[\xed\xb2\x80]', 'ignore', '[]'),
|
||||||
|
(b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
|
||||||
|
))
|
||||||
|
else:
|
||||||
|
tests.extend((
|
||||||
|
(b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
|
||||||
|
))
|
||||||
|
for raw, errors, expected in tests:
|
||||||
|
if expected is not None:
|
||||||
|
try:
|
||||||
|
decoded = raw.decode('cp65001', errors)
|
||||||
|
except UnicodeDecodeError as err:
|
||||||
|
self.fail('Unable to decode %a from cp65001 with '
|
||||||
|
'errors=%r: %s' % (raw, errors, err))
|
||||||
|
self.assertEqual(decoded, expected,
|
||||||
|
'%a.decode("cp65001", %r)=%a != %a'
|
||||||
|
% (raw, errors, decoded, expected))
|
||||||
|
else:
|
||||||
|
self.assertRaises(UnicodeDecodeError,
|
||||||
|
raw.decode, 'cp65001', errors)
|
||||||
|
|
||||||
|
@unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
|
||||||
|
def test_lone_surrogates(self):
|
||||||
|
self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
|
||||||
|
self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
|
||||||
|
self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
|
||||||
|
b'[\\udc80]')
|
||||||
|
self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
|
||||||
|
b'[&#56448;]')
|
||||||
|
self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
|
||||||
|
b'[\x80]')
|
||||||
|
self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
|
||||||
|
b'[]')
|
||||||
|
self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
|
||||||
|
b'[?]')
|
||||||
|
|
||||||
|
@unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
|
||||||
|
def test_surrogatepass_handler(self):
|
||||||
|
self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
|
||||||
|
b"abc\xed\xa0\x80def")
|
||||||
|
self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
|
||||||
|
"abc\ud800def")
|
||||||
|
self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
|
||||||
|
b"\xf0\x90\xbf\xbf\xed\xa0\x80")
|
||||||
|
self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
|
||||||
|
"\U00010fff\uD800")
|
||||||
|
self.assertTrue(codecs.lookup_error("surrogatepass"))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class UTF7Test(ReadTest):
|
class UTF7Test(ReadTest):
|
||||||
encoding = "utf-7"
|
encoding = "utf-7"
|
||||||
|
|
||||||
|
@ -1747,11 +1853,9 @@ class TransformCodecTest(unittest.TestCase):
|
||||||
@unittest.skipUnless(sys.platform == 'win32',
|
@unittest.skipUnless(sys.platform == 'win32',
|
||||||
'code pages are specific to Windows')
|
'code pages are specific to Windows')
|
||||||
class CodePageTest(unittest.TestCase):
|
class CodePageTest(unittest.TestCase):
|
||||||
|
# CP_UTF8 is already tested by CP65001Test
|
||||||
CP_UTF8 = 65001
|
CP_UTF8 = 65001
|
||||||
|
|
||||||
def vista_or_later(self):
|
|
||||||
return (sys.getwindowsversion().major >= 6)
|
|
||||||
|
|
||||||
def test_invalid_code_page(self):
|
def test_invalid_code_page(self):
|
||||||
self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
|
self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
|
||||||
self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
|
self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
|
||||||
|
@ -1804,19 +1908,22 @@ class CodePageTest(unittest.TestCase):
|
||||||
self.check_encode(932, (
|
self.check_encode(932, (
|
||||||
('abc', 'strict', b'abc'),
|
('abc', 'strict', b'abc'),
|
||||||
('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
|
('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
|
||||||
# not encodable
|
# test error handlers
|
||||||
('\xff', 'strict', None),
|
('\xff', 'strict', None),
|
||||||
('[\xff]', 'ignore', b'[]'),
|
('[\xff]', 'ignore', b'[]'),
|
||||||
('[\xff]', 'replace', b'[y]'),
|
('[\xff]', 'replace', b'[y]'),
|
||||||
('[\u20ac]', 'replace', b'[?]'),
|
('[\u20ac]', 'replace', b'[?]'),
|
||||||
|
('[\xff]', 'backslashreplace', b'[\\xff]'),
|
||||||
|
('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
|
||||||
))
|
))
|
||||||
self.check_decode(932, (
|
self.check_decode(932, (
|
||||||
(b'abc', 'strict', 'abc'),
|
(b'abc', 'strict', 'abc'),
|
||||||
(b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
|
(b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
|
||||||
# invalid bytes
|
# invalid bytes
|
||||||
(b'\xff', 'strict', None),
|
(b'[\xff]', 'strict', None),
|
||||||
(b'\xff', 'ignore', ''),
|
(b'[\xff]', 'ignore', '[]'),
|
||||||
(b'\xff', 'replace', '\ufffd'),
|
(b'[\xff]', 'replace', '[\ufffd]'),
|
||||||
|
(b'[\xff]', 'surrogateescape', '[\udcff]'),
|
||||||
(b'\x81\x00abc', 'strict', None),
|
(b'\x81\x00abc', 'strict', None),
|
||||||
(b'\x81\x00abc', 'ignore', '\x00abc'),
|
(b'\x81\x00abc', 'ignore', '\x00abc'),
|
||||||
(b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
|
(b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
|
||||||
|
@ -1857,58 +1964,6 @@ class CodePageTest(unittest.TestCase):
|
||||||
(b'[\xff]', 'strict', '[\xff]'),
|
(b'[\xff]', 'strict', '[\xff]'),
|
||||||
))
|
))
|
||||||
|
|
||||||
def test_cp_utf8(self):
|
|
||||||
cp = self.CP_UTF8
|
|
||||||
|
|
||||||
tests = [
|
|
||||||
('abc', 'strict', b'abc'),
|
|
||||||
('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
|
|
||||||
('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
|
|
||||||
]
|
|
||||||
if self.vista_or_later():
|
|
||||||
tests.append(('\udc80', 'strict', None))
|
|
||||||
tests.append(('\udc80', 'ignore', b''))
|
|
||||||
tests.append(('\udc80', 'replace', b'?'))
|
|
||||||
else:
|
|
||||||
tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
|
|
||||||
self.check_encode(cp, tests)
|
|
||||||
|
|
||||||
tests = [
|
|
||||||
(b'abc', 'strict', 'abc'),
|
|
||||||
(b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
|
|
||||||
(b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
|
|
||||||
(b'\xef\xbf\xbd', 'strict', '\ufffd'),
|
|
||||||
(b'[\xc3\xa9]', 'strict', '[\xe9]'),
|
|
||||||
# invalid bytes
|
|
||||||
(b'[\xff]', 'strict', None),
|
|
||||||
(b'[\xff]', 'ignore', '[]'),
|
|
||||||
(b'[\xff]', 'replace', '[\ufffd]'),
|
|
||||||
]
|
|
||||||
if self.vista_or_later():
|
|
||||||
tests.extend((
|
|
||||||
(b'[\xed\xb2\x80]', 'strict', None),
|
|
||||||
(b'[\xed\xb2\x80]', 'ignore', '[]'),
|
|
||||||
(b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
|
|
||||||
))
|
|
||||||
else:
|
|
||||||
tests.extend((
|
|
||||||
(b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
|
|
||||||
))
|
|
||||||
self.check_decode(cp, tests)
|
|
||||||
|
|
||||||
def test_error_handlers(self):
|
|
||||||
self.check_encode(932, (
|
|
||||||
('\xff', 'backslashreplace', b'\\xff'),
|
|
||||||
('\xff', 'xmlcharrefreplace', b'&#255;'),
|
|
||||||
))
|
|
||||||
self.check_decode(932, (
|
|
||||||
(b'\xff', 'surrogateescape', '\udcff'),
|
|
||||||
))
|
|
||||||
if self.vista_or_later():
|
|
||||||
self.check_encode(self.CP_UTF8, (
|
|
||||||
('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
|
|
||||||
))
|
|
||||||
|
|
||||||
def test_multibyte_encoding(self):
|
def test_multibyte_encoding(self):
|
||||||
self.check_decode(932, (
|
self.check_decode(932, (
|
||||||
(b'\x84\xe9\x80', 'ignore', '\u9a3e'),
|
(b'\x84\xe9\x80', 'ignore', '\u9a3e'),
|
||||||
|
@ -1918,7 +1973,7 @@ class CodePageTest(unittest.TestCase):
|
||||||
(b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
|
(b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
|
||||||
(b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
|
(b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
|
||||||
))
|
))
|
||||||
if self.vista_or_later():
|
if VISTA_OR_LATER:
|
||||||
self.check_encode(self.CP_UTF8, (
|
self.check_encode(self.CP_UTF8, (
|
||||||
('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
|
('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
|
||||||
('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
|
('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
|
||||||
|
@ -1951,6 +2006,7 @@ def test_main():
|
||||||
UTF16BETest,
|
UTF16BETest,
|
||||||
UTF8Test,
|
UTF8Test,
|
||||||
UTF8SigTest,
|
UTF8SigTest,
|
||||||
|
CP65001Test,
|
||||||
UTF7Test,
|
UTF7Test,
|
||||||
UTF16ExTest,
|
UTF16ExTest,
|
||||||
ReadBufferTest,
|
ReadBufferTest,
|
||||||
|
|
|
@ -341,6 +341,8 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #13247: Add cp65001 codec, the Windows UTF-8 (CP_UTF8).
|
||||||
|
|
||||||
- Issue #13226: Add RTLD_xxx constants to the os module. These constants can be
|
- Issue #13226: Add RTLD_xxx constants to the os module. These constants can be
|
||||||
used with sys.setdlopenflags().
|
used with sys.setdlopenflags().
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue