mirror of
https://github.com/python/cpython.git
synced 2025-09-26 18:29:57 +00:00
Issue #20574: Implement incremental decoder for cp65001 codec
(Windows code page 65001, Microsoft UTF-8).
This commit is contained in:
parent
c49926748b
commit
7d00cc1a64
4 changed files with 22 additions and 43 deletions
|
@ -11,20 +11,23 @@ if not hasattr(codecs, 'code_page_encode'):
|
||||||
### Codec APIs
|
### Codec APIs
|
||||||
|
|
||||||
encode = functools.partial(codecs.code_page_encode, 65001)
|
encode = functools.partial(codecs.code_page_encode, 65001)
|
||||||
decode = functools.partial(codecs.code_page_decode, 65001)
|
_decode = functools.partial(codecs.code_page_decode, 65001)
|
||||||
|
|
||||||
|
def decode(input, errors='strict'):
|
||||||
|
return codecs.code_page_decode(65001, input, errors, True)
|
||||||
|
|
||||||
class IncrementalEncoder(codecs.IncrementalEncoder):
|
class IncrementalEncoder(codecs.IncrementalEncoder):
|
||||||
def encode(self, input, final=False):
|
def encode(self, input, final=False):
|
||||||
return encode(input, self.errors)[0]
|
return encode(input, self.errors)[0]
|
||||||
|
|
||||||
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
|
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
|
||||||
_buffer_decode = decode
|
_buffer_decode = _decode
|
||||||
|
|
||||||
class StreamWriter(codecs.StreamWriter):
|
class StreamWriter(codecs.StreamWriter):
|
||||||
encode = encode
|
encode = encode
|
||||||
|
|
||||||
class StreamReader(codecs.StreamReader):
|
class StreamReader(codecs.StreamReader):
|
||||||
decode = decode
|
decode = _decode
|
||||||
|
|
||||||
### encodings module API
|
### encodings module API
|
||||||
|
|
||||||
|
|
|
@ -890,10 +890,6 @@ class CP65001Test(ReadTest, unittest.TestCase):
|
||||||
"\U00010fff\uD800")
|
"\U00010fff\uD800")
|
||||||
self.assertTrue(codecs.lookup_error("surrogatepass"))
|
self.assertTrue(codecs.lookup_error("surrogatepass"))
|
||||||
|
|
||||||
def test_readline(self):
|
|
||||||
self.skipTest("issue #20571: code page 65001 codec does not "
|
|
||||||
"support partial decoder yet")
|
|
||||||
|
|
||||||
|
|
||||||
class UTF7Test(ReadTest, unittest.TestCase):
|
class UTF7Test(ReadTest, unittest.TestCase):
|
||||||
encoding = "utf-7"
|
encoding = "utf-7"
|
||||||
|
@ -2750,15 +2746,15 @@ class CodePageTest(unittest.TestCase):
|
||||||
self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
|
self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
|
||||||
codecs.code_page_encode, 932, '\xff')
|
codecs.code_page_encode, 932, '\xff')
|
||||||
self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
|
self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
|
||||||
codecs.code_page_decode, 932, b'\x81\x00')
|
codecs.code_page_decode, 932, b'\x81\x00', 'strict', True)
|
||||||
self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
|
self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
|
||||||
codecs.code_page_decode, self.CP_UTF8, b'\xff')
|
codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)
|
||||||
|
|
||||||
def check_decode(self, cp, tests):
|
def check_decode(self, cp, tests):
|
||||||
for raw, errors, expected in tests:
|
for raw, errors, expected in tests:
|
||||||
if expected is not None:
|
if expected is not None:
|
||||||
try:
|
try:
|
||||||
decoded = codecs.code_page_decode(cp, raw, errors)
|
decoded = codecs.code_page_decode(cp, raw, errors, True)
|
||||||
except UnicodeDecodeError as err:
|
except UnicodeDecodeError as err:
|
||||||
self.fail('Unable to decode %a from "cp%s" with '
|
self.fail('Unable to decode %a from "cp%s" with '
|
||||||
'errors=%r: %s' % (raw, cp, errors, err))
|
'errors=%r: %s' % (raw, cp, errors, err))
|
||||||
|
@ -2770,7 +2766,7 @@ class CodePageTest(unittest.TestCase):
|
||||||
self.assertLessEqual(decoded[1], len(raw))
|
self.assertLessEqual(decoded[1], len(raw))
|
||||||
else:
|
else:
|
||||||
self.assertRaises(UnicodeDecodeError,
|
self.assertRaises(UnicodeDecodeError,
|
||||||
codecs.code_page_decode, cp, raw, errors)
|
codecs.code_page_decode, cp, raw, errors, True)
|
||||||
|
|
||||||
def check_encode(self, cp, tests):
|
def check_encode(self, cp, tests):
|
||||||
for text, errors, expected in tests:
|
for text, errors, expected in tests:
|
||||||
|
|
|
@ -13,6 +13,9 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #20574: Implement incremental decoder for cp65001 codec (Windows code
|
||||||
|
page 65001, Microsoft UTF-8).
|
||||||
|
|
||||||
- Issue #20879: Delay the initialization of encoding and decoding tables for
|
- Issue #20879: Delay the initialization of encoding and decoding tables for
|
||||||
base32, ascii85 and base85 codecs in the base64 module, and delay the
|
base32, ascii85 and base85 codecs in the base64 module, and delay the
|
||||||
initialization of the unquote_to_bytes() table of the urllib.parse module, to
|
initialization of the unquote_to_bytes() table of the urllib.parse module, to
|
||||||
|
|
|
@ -6817,28 +6817,6 @@ code_page_name(UINT code_page, PyObject **obj)
|
||||||
return PyBytes_AS_STRING(*obj);
|
return PyBytes_AS_STRING(*obj);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
|
||||||
is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
|
|
||||||
{
|
|
||||||
const char *curr = s + offset;
|
|
||||||
const char *prev;
|
|
||||||
|
|
||||||
if (!IsDBCSLeadByteEx(code_page, *curr))
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
prev = CharPrevExA(code_page, s, curr, 0);
|
|
||||||
if (prev == curr)
|
|
||||||
return 1;
|
|
||||||
/* FIXME: This code is limited to "true" double-byte encodings,
|
|
||||||
as it assumes an incomplete character consists of a single
|
|
||||||
byte. */
|
|
||||||
if (curr - prev == 2)
|
|
||||||
return 1;
|
|
||||||
if (!IsDBCSLeadByteEx(code_page, *prev))
|
|
||||||
return 1;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static DWORD
|
static DWORD
|
||||||
decode_code_page_flags(UINT code_page)
|
decode_code_page_flags(UINT code_page)
|
||||||
{
|
{
|
||||||
|
@ -6913,7 +6891,7 @@ static int
|
||||||
decode_code_page_errors(UINT code_page,
|
decode_code_page_errors(UINT code_page,
|
||||||
PyObject **v,
|
PyObject **v,
|
||||||
const char *in, const int size,
|
const char *in, const int size,
|
||||||
const char *errors)
|
const char *errors, int final)
|
||||||
{
|
{
|
||||||
const char *startin = in;
|
const char *startin = in;
|
||||||
const char *endin = in + size;
|
const char *endin = in + size;
|
||||||
|
@ -6940,7 +6918,7 @@ decode_code_page_errors(UINT code_page,
|
||||||
if (encoding == NULL)
|
if (encoding == NULL)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
if (errors == NULL || strcmp(errors, "strict") == 0) {
|
if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
|
||||||
/* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
|
/* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
|
||||||
UnicodeDecodeError. */
|
UnicodeDecodeError. */
|
||||||
make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
|
make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
|
||||||
|
@ -7003,6 +6981,10 @@ decode_code_page_errors(UINT code_page,
|
||||||
if (outsize <= 0) {
|
if (outsize <= 0) {
|
||||||
Py_ssize_t startinpos, endinpos, outpos;
|
Py_ssize_t startinpos, endinpos, outpos;
|
||||||
|
|
||||||
|
/* last character in partial decode? */
|
||||||
|
if (in + insize >= endin && !final)
|
||||||
|
break;
|
||||||
|
|
||||||
startinpos = in - startin;
|
startinpos = in - startin;
|
||||||
endinpos = startinpos + 1;
|
endinpos = startinpos + 1;
|
||||||
outpos = out - PyUnicode_AS_UNICODE(*v);
|
outpos = out - PyUnicode_AS_UNICODE(*v);
|
||||||
|
@ -7031,7 +7013,7 @@ decode_code_page_errors(UINT code_page,
|
||||||
assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
|
assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
|
||||||
if (unicode_resize(v, outsize) < 0)
|
if (unicode_resize(v, outsize) < 0)
|
||||||
goto error;
|
goto error;
|
||||||
ret = size;
|
ret = in - startin;
|
||||||
|
|
||||||
error:
|
error:
|
||||||
Py_XDECREF(encoding_obj);
|
Py_XDECREF(encoding_obj);
|
||||||
|
@ -7072,24 +7054,19 @@ decode_code_page_stateful(int code_page,
|
||||||
done = 1;
|
done = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Skip trailing lead-byte unless 'final' is set */
|
|
||||||
if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
|
|
||||||
--chunk_size;
|
|
||||||
|
|
||||||
if (chunk_size == 0 && done) {
|
if (chunk_size == 0 && done) {
|
||||||
if (v != NULL)
|
if (v != NULL)
|
||||||
break;
|
break;
|
||||||
_Py_RETURN_UNICODE_EMPTY();
|
_Py_RETURN_UNICODE_EMPTY();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
converted = decode_code_page_strict(code_page, &v,
|
converted = decode_code_page_strict(code_page, &v,
|
||||||
s, chunk_size);
|
s, chunk_size);
|
||||||
if (converted == -2)
|
if (converted == -2)
|
||||||
converted = decode_code_page_errors(code_page, &v,
|
converted = decode_code_page_errors(code_page, &v,
|
||||||
s, chunk_size,
|
s, chunk_size,
|
||||||
errors);
|
errors, final);
|
||||||
assert(converted != 0);
|
assert(converted != 0 || done);
|
||||||
|
|
||||||
if (converted < 0) {
|
if (converted < 0) {
|
||||||
Py_XDECREF(v);
|
Py_XDECREF(v);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue