Mirror of https://github.com/python/cpython.git
SF bug #1251300: On UCS-4 builds the "unicode-internal" codec will now complain
about illegal code points. The codec now supports PEP 293 style error handlers.
(This is a variant of Nik Haldimann's patch that detects truncated data.)
parent 523c9f0709
commit a47d1c08d0
6 changed files with 173 additions and 5 deletions
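A minimal sketch of the new behaviour on a UCS-4 ("wide") build, in Python 2; the
expected results mirror the new tests further below:

    import sys

    if sys.maxunicode > 0xffff:      # UCS-4 ("wide") build only
        # Truncated input: 5 bytes is not a whole number of Py_UNICODE units.
        # Strict decoding now raises UnicodeDecodeError; previously the
        # trailing byte was silently dropped.
        try:
            "\x00\x00\x00\x00\x00".decode("unicode-internal")
        except UnicodeDecodeError, exc:
            print "strict:", exc.reason

        # PEP 293 error handlers are honoured by the codec:
        # "ignore" yields u"\u0000", "replace" yields u"\u0000\ufffd".
        print repr("\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"))
        print repr("\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"))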
@@ -797,6 +797,16 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
     int length                  /* Number of Py_UNICODE chars to encode */
     );
 
+/* --- Unicode Internal Codec ---------------------------------------------
+
+   Only for internal use in _codecsmodule.c */
+
+PyObject *_PyUnicode_DecodeUnicodeInternal(
+    const char *string,
+    int length,
+    const char *errors
+    );
+
 /* --- Latin-1 Codecs -----------------------------------------------------
 
    Note: Latin-1 corresponds to the first 256 Unicode ordinals.

@@ -111,7 +111,7 @@ class CodecCallbackTest(unittest.TestCase):
             sout += "\\U%08x" % sys.maxunicode
         self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
 
-    def test_relaxedutf8(self):
+    def test_decoderelaxedutf8(self):
         # This is the test for a decoding callback handler,
         # that relaxes the UTF-8 minimal encoding restriction.
         # A null byte that is encoded as "\xc0\x80" will be

@@ -158,6 +158,35 @@ class CodecCallbackTest(unittest.TestCase):
         charmap[ord("?")] = u"XYZ"
         self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
 
+    def test_decodeunicodeinternal(self):
+        self.assertRaises(
+            UnicodeDecodeError,
+            "\x00\x00\x00\x00\x00".decode,
+            "unicode-internal",
+        )
+        if sys.maxunicode > 0xffff:
+            def handler_unicodeinternal(exc):
+                if not isinstance(exc, UnicodeDecodeError):
+                    raise TypeError("don't know how to handle %r" % exc)
+                return (u"\x01", 1)
+
+            self.assertEqual(
+                "\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"),
+                u"\u0000"
+            )
+
+            self.assertEqual(
+                "\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"),
+                u"\u0000\ufffd"
+            )
+
+            codecs.register_error("test.hui", handler_unicodeinternal)
+
+            self.assertEqual(
+                "\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"),
+                u"\u0000\u0001\u0000"
+            )
+
     def test_callbacks(self):
         def handler1(exc):
             if not isinstance(exc, UnicodeEncodeError) \

@@ -503,7 +532,8 @@ class CodecCallbackTest(unittest.TestCase):
         for (enc, bytes) in (
             ("ascii", "\xff"),
             ("utf-8", "\xff"),
-            ("utf-7", "+x-")
+            ("utf-7", "+x-"),
+            ("unicode-internal", "\x00"),
         ):
             self.assertRaises(
                 TypeError,

@@ -1,7 +1,7 @@
 from test import test_support
 import unittest
 import codecs
-import StringIO
+import sys, StringIO
 
 class Queue(object):
     """

@@ -453,6 +453,54 @@ class PunycodeTest(unittest.TestCase):
         for uni, puny in punycode_testcases:
             self.assertEquals(uni, puny.decode("punycode"))
 
+class UnicodeInternalTest(unittest.TestCase):
+    def test_bug1251300(self):
+        # Decoding with unicode_internal used to not correctly handle "code
+        # points" above 0x10ffff on UCS-4 builds.
+        if sys.maxunicode > 0xffff:
+            ok = [
+                ("\x00\x10\xff\xff", u"\U0010ffff"),
+                ("\x00\x00\x01\x01", u"\U00000101"),
+                ("", u""),
+            ]
+            not_ok = [
+                "\x7f\xff\xff\xff",
+                "\x80\x00\x00\x00",
+                "\x81\x00\x00\x00",
+                "\x00",
+                "\x00\x00\x00\x00\x00",
+            ]
+            for internal, uni in ok:
+                if sys.byteorder == "little":
+                    internal = "".join(reversed(internal))
+                self.assertEquals(uni, internal.decode("unicode_internal"))
+            for internal in not_ok:
+                if sys.byteorder == "little":
+                    internal = "".join(reversed(internal))
+                self.assertRaises(UnicodeDecodeError, internal.decode,
+                                  "unicode_internal")
+
+    def test_decode_error_attributes(self):
+        if sys.maxunicode > 0xffff:
+            try:
+                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
+            except UnicodeDecodeError, ex:
+                self.assertEquals("unicode_internal", ex.encoding)
+                self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
+                self.assertEquals(4, ex.start)
+                self.assertEquals(8, ex.end)
+            else:
+                self.fail()
+
+    def test_decode_callback(self):
+        if sys.maxunicode > 0xffff:
+            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
+            decoder = codecs.getdecoder("unicode_internal")
+            ab = u"ab".encode("unicode_internal")
+            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
+                              "UnicodeInternalTest")
+            self.assertEquals((u"ab", 12), ignored)
+
 # From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
 nameprep_tests = [
     # 3.1 Map to nothing.

@@ -885,6 +933,7 @@ def test_main():
         EscapeDecodeTest,
         RecodingTest,
         PunycodeTest,
+        UnicodeInternalTest,
         NameprepTest,
         CodecTest,
         CodecsModuleTest,

@@ -435,6 +435,10 @@ Library
   line ending. Remove the special handling of a "\r\n" that has been split
   between two lines.
 
+- Bug #1251300: On UCS-4 builds the "unicode-internal" codec will now complain
+  about illegal code points. The codec now supports PEP 293 style error
+  handlers.
+
 
 Build
 -----

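As the NEWS entry above mentions, errors are now routed through PEP 293 style
handlers. A minimal sketch of wiring a custom decoding handler into the codec
(the handler name "example.u0001" is illustrative only):

    import codecs

    def replace_with_u0001(exc):
        # Only decoding errors are expected from the unicode-internal codec.
        if not isinstance(exc, UnicodeDecodeError):
            raise TypeError("don't know how to handle %r" % exc)
        # Emit U+0001 and resume decoding right after the offending bytes.
        return (u"\x01", exc.end)

    codecs.register_error("example.u0001", replace_with_u0001)
    print repr("\x00\x00\x00\x00\x00".decode("unicode-internal", "example.u0001"))

On a UCS-4 build this decodes the first four bytes normally and substitutes
U+0001 for the trailing truncated byte.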
@@ -254,8 +254,8 @@ unicode_internal_decode(PyObject *self,
     else {
         if (PyObject_AsReadBuffer(obj, (const void **)&data, &size))
             return NULL;
-        return codec_tuple(PyUnicode_FromUnicode((Py_UNICODE *)data,
-                                                 size / sizeof(Py_UNICODE)),
+        return codec_tuple(_PyUnicode_DecodeUnicodeInternal(data, size, errors),
                            size);
     }
 }

@@ -2273,6 +2273,81 @@ PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
                                  PyUnicode_GET_SIZE(unicode));
 }
 
+/* --- Unicode Internal Codec ------------------------------------------- */
+
+PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
+                                           int size,
+                                           const char *errors)
+{
+    const char *starts = s;
+    int startinpos;
+    int endinpos;
+    int outpos;
+    Py_UNICODE unimax;
+    PyUnicodeObject *v;
+    Py_UNICODE *p;
+    const char *end;
+    const char *reason;
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+
+    unimax = PyUnicode_GetMax();
+    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
+    if (v == NULL)
+        goto onError;
+    if (PyUnicode_GetSize((PyObject *)v) == 0)
+        return (PyObject *)v;
+    p = PyUnicode_AS_UNICODE(v);
+    end = s + size;
+
+    while (s < end) {
+        *p = *(Py_UNICODE *)s;
+        /* We have to sanity check the raw data, otherwise doom looms for
+           some malformed UCS-4 data. */
+        if (
+            #ifdef Py_UNICODE_WIDE
+            *p > unimax || *p < 0 ||
+            #endif
+            end-s < Py_UNICODE_SIZE
+            )
+            {
+            startinpos = s - starts;
+            if (end-s < Py_UNICODE_SIZE) {
+                endinpos = end-starts;
+                reason = "truncated input";
+            }
+            else {
+                endinpos = s - starts + Py_UNICODE_SIZE;
+                reason = "illegal code point (> 0x10FFFF)";
+            }
+            outpos = p - PyUnicode_AS_UNICODE(v);
+            if (unicode_decode_call_errorhandler(
+                    errors, &errorHandler,
+                    "unicode_internal", reason,
+                    starts, size, &startinpos, &endinpos, &exc, &s,
+                    (PyObject **)&v, &outpos, &p)) {
+                goto onError;
+            }
+        }
+        else {
+            p++;
+            s += Py_UNICODE_SIZE;
+        }
+    }
+
+    if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
+        goto onError;
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return (PyObject *)v;
+
+ onError:
+    Py_XDECREF(v);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return NULL;
+}
+
 /* --- Latin-1 Codec ------------------------------------------------------ */
 
 PyObject *PyUnicode_DecodeLatin1(const char *s,