Mirror of https://github.com/python/cpython.git
SF bug #1251300: On UCS-4 builds the "unicode-internal" codec will now complain
about illegal code points. The codec now supports PEP 293 style error handlers.
(This is a variant of Nik Haldimann's patch that detects truncated data.)
parent 523c9f0709
commit a47d1c08d0
6 changed files with 173 additions and 5 deletions
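A minimal sketch of the new behaviour on a UCS-4 ("wide") build, in Python 2; the
expected results mirror the new tests further below:

    import sys

    if sys.maxunicode > 0xffff:      # UCS-4 ("wide") build only
        # Truncated input: 5 bytes is not a whole number of Py_UNICODE units.
        # Strict decoding now raises UnicodeDecodeError; previously the
        # trailing byte was silently dropped.
        try:
            "\x00\x00\x00\x00\x00".decode("unicode-internal")
        except UnicodeDecodeError, exc:
            print "strict:", exc.reason

        # PEP 293 error handlers are honoured by the codec:
        # "ignore" yields u"\u0000", "replace" yields u"\u0000\ufffd".
        print repr("\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"))
        print repr("\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"))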
@@ -797,6 +797,16 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
     int length                  /* Number of Py_UNICODE chars to encode */
     );
 
+/* --- Unicode Internal Codec ---------------------------------------------
+
+   Only for internal use in _codecsmodule.c */
+
+PyObject *_PyUnicode_DecodeUnicodeInternal(
+    const char *string,
+    int length,
+    const char *errors
+    );
+
 /* --- Latin-1 Codecs -----------------------------------------------------
 
    Note: Latin-1 corresponds to the first 256 Unicode ordinals.

@@ -111,7 +111,7 @@ class CodecCallbackTest(unittest.TestCase):
             sout += "\\U%08x" % sys.maxunicode
         self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
 
-    def test_relaxedutf8(self):
+    def test_decoderelaxedutf8(self):
         # This is the test for a decoding callback handler,
         # that relaxes the UTF-8 minimal encoding restriction.
         # A null byte that is encoded as "\xc0\x80" will be

@@ -158,6 +158,35 @@ class CodecCallbackTest(unittest.TestCase):
         charmap[ord("?")] = u"XYZ"
         self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
 
+    def test_decodeunicodeinternal(self):
+        self.assertRaises(
+            UnicodeDecodeError,
+            "\x00\x00\x00\x00\x00".decode,
+            "unicode-internal",
+        )
+        if sys.maxunicode > 0xffff:
+            def handler_unicodeinternal(exc):
+                if not isinstance(exc, UnicodeDecodeError):
+                    raise TypeError("don't know how to handle %r" % exc)
+                return (u"\x01", 1)
+
+            self.assertEqual(
+                "\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"),
+                u"\u0000"
+            )
+
+            self.assertEqual(
+                "\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"),
+                u"\u0000\ufffd"
+            )
+
+            codecs.register_error("test.hui", handler_unicodeinternal)
+
+            self.assertEqual(
+                "\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"),
+                u"\u0000\u0001\u0000"
+            )
+
     def test_callbacks(self):
         def handler1(exc):
             if not isinstance(exc, UnicodeEncodeError) \

@@ -503,7 +532,8 @@ class CodecCallbackTest(unittest.TestCase):
         for (enc, bytes) in (
             ("ascii", "\xff"),
             ("utf-8", "\xff"),
-            ("utf-7", "+x-")
+            ("utf-7", "+x-"),
+            ("unicode-internal", "\x00"),
         ):
             self.assertRaises(
                 TypeError,

@@ -1,7 +1,7 @@
 from test import test_support
 import unittest
 import codecs
-import StringIO
+import sys, StringIO
 
 class Queue(object):
     """

@@ -453,6 +453,54 @@ class PunycodeTest(unittest.TestCase):
         for uni, puny in punycode_testcases:
             self.assertEquals(uni, puny.decode("punycode"))
 
+class UnicodeInternalTest(unittest.TestCase):
+    def test_bug1251300(self):
+        # Decoding with unicode_internal used to not correctly handle "code
+        # points" above 0x10ffff on UCS-4 builds.
+        if sys.maxunicode > 0xffff:
+            ok = [
+                ("\x00\x10\xff\xff", u"\U0010ffff"),
+                ("\x00\x00\x01\x01", u"\U00000101"),
+                ("", u""),
+            ]
+            not_ok = [
+                "\x7f\xff\xff\xff",
+                "\x80\x00\x00\x00",
+                "\x81\x00\x00\x00",
+                "\x00",
+                "\x00\x00\x00\x00\x00",
+            ]
+            for internal, uni in ok:
+                if sys.byteorder == "little":
+                    internal = "".join(reversed(internal))
+                self.assertEquals(uni, internal.decode("unicode_internal"))
+            for internal in not_ok:
+                if sys.byteorder == "little":
+                    internal = "".join(reversed(internal))
+                self.assertRaises(UnicodeDecodeError, internal.decode,
+                                  "unicode_internal")
+
+    def test_decode_error_attributes(self):
+        if sys.maxunicode > 0xffff:
+            try:
+                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
+            except UnicodeDecodeError, ex:
+                self.assertEquals("unicode_internal", ex.encoding)
+                self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
+                self.assertEquals(4, ex.start)
+                self.assertEquals(8, ex.end)
+            else:
+                self.fail()
+
+    def test_decode_callback(self):
+        if sys.maxunicode > 0xffff:
+            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
+            decoder = codecs.getdecoder("unicode_internal")
+            ab = u"ab".encode("unicode_internal")
+            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
+                              "UnicodeInternalTest")
+            self.assertEquals((u"ab", 12), ignored)
+
 # From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
 nameprep_tests = [
     # 3.1 Map to nothing.

@@ -885,6 +933,7 @@ def test_main():
         EscapeDecodeTest,
         RecodingTest,
         PunycodeTest,
+        UnicodeInternalTest,
         NameprepTest,
         CodecTest,
         CodecsModuleTest,

@@ -435,6 +435,10 @@ Library
   line ending. Remove the special handling of a "\r\n" that has been split
   between two lines.
 
+- Bug #1251300: On UCS-4 builds the "unicode-internal" codec will now complain
+  about illegal code points. The codec now supports PEP 293 style error
+  handlers.
+
 
 Build
 -----

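As the NEWS entry above mentions, errors are now routed through PEP 293 style
handlers. A minimal sketch of wiring a custom decoding handler into the codec
(the handler name "example.u0001" is illustrative only):

    import codecs

    def replace_with_u0001(exc):
        # Only decoding errors are expected from the unicode-internal codec.
        if not isinstance(exc, UnicodeDecodeError):
            raise TypeError("don't know how to handle %r" % exc)
        # Emit U+0001 and resume decoding right after the offending bytes.
        return (u"\x01", exc.end)

    codecs.register_error("example.u0001", replace_with_u0001)
    print repr("\x00\x00\x00\x00\x00".decode("unicode-internal", "example.u0001"))

On a UCS-4 build this decodes the first four bytes normally and substitutes
U+0001 for the trailing truncated byte.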
@@ -254,8 +254,8 @@ unicode_internal_decode(PyObject *self,
     else {
         if (PyObject_AsReadBuffer(obj, (const void **)&data, &size))
             return NULL;
-        return codec_tuple(PyUnicode_FromUnicode((Py_UNICODE *)data,
-                                                 size / sizeof(Py_UNICODE)),
+        return codec_tuple(_PyUnicode_DecodeUnicodeInternal(data, size, errors),
                            size);
     }
 }

@@ -2273,6 +2273,81 @@ PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
                                  PyUnicode_GET_SIZE(unicode));
 }
 
+/* --- Unicode Internal Codec ------------------------------------------- */
+
+PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
+                                           int size,
+                                           const char *errors)
+{
+    const char *starts = s;
+    int startinpos;
+    int endinpos;
+    int outpos;
+    Py_UNICODE unimax;
+    PyUnicodeObject *v;
+    Py_UNICODE *p;
+    const char *end;
+    const char *reason;
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+
+    unimax = PyUnicode_GetMax();
+    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
+    if (v == NULL)
+        goto onError;
+    if (PyUnicode_GetSize((PyObject *)v) == 0)
+        return (PyObject *)v;
+    p = PyUnicode_AS_UNICODE(v);
+    end = s + size;
+
+    while (s < end) {
+        *p = *(Py_UNICODE *)s;
+        /* We have to sanity check the raw data, otherwise doom looms for
+           some malformed UCS-4 data. */
+        if (
+            #ifdef Py_UNICODE_WIDE
+            *p > unimax || *p < 0 ||
+            #endif
+            end-s < Py_UNICODE_SIZE
+            )
+            {
+            startinpos = s - starts;
+            if (end-s < Py_UNICODE_SIZE) {
+                endinpos = end-starts;
+                reason = "truncated input";
+            }
+            else {
+                endinpos = s - starts + Py_UNICODE_SIZE;
+                reason = "illegal code point (> 0x10FFFF)";
+            }
+            outpos = p - PyUnicode_AS_UNICODE(v);
+            if (unicode_decode_call_errorhandler(
+                    errors, &errorHandler,
+                    "unicode_internal", reason,
+                    starts, size, &startinpos, &endinpos, &exc, &s,
+                    (PyObject **)&v, &outpos, &p)) {
+                goto onError;
+            }
+        }
+        else {
+            p++;
+            s += Py_UNICODE_SIZE;
+        }
+    }
+
+    if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
+        goto onError;
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return (PyObject *)v;
+
+ onError:
+    Py_XDECREF(v);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return NULL;
+}
+
 /* --- Latin-1 Codec ------------------------------------------------------ */
 
 PyObject *PyUnicode_DecodeLatin1(const char *s,