SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support decoding incomplete input (when the input stream is temporarily exhausted).

codecs.StreamReader now implements buffering, which enables proper readline support for the UTF-16 decoders. codecs.StreamReader.read() has a new argument, chars, which specifies the number of characters to return. codecs.StreamReader.readline() and codecs.StreamReader.readlines() have a new argument, keepends. Trailing "\n"s will be stripped from the lines if keepends is false. Added C APIs PyUnicode_DecodeUTF8Stateful and PyUnicode_DecodeUTF16Stateful.
2025-08-31 14:07:50 +00:00 · 2004-09-07 20:24:22 +00:00 · 2004-09-07 20:24:22 +00:00 · 69652035bc
commit 69652035bc
parent a708d6e3b0
12 changed files with 419 additions and 173 deletions
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@ -3,7 +3,45 @@ import unittest
 import codecs
 import StringIO

-class UTF16Test(unittest.TestCase):
+class Queue(object):  # in-memory FIFO string buffer used as the stream under a StreamReader
+    """
+    queue: write bytes at one end, read bytes from the other end
+    """
+    def __init__(self):
+        self._buffer = ""  # bytes written but not yet read
+
+    def write(self, chars):
+        self._buffer += chars  # append at the tail of the buffer
+
+    def read(self, size=-1):
+        if size<0:  # negative size: return everything buffered and empty the buffer
+            s = self._buffer
+            self._buffer = ""
+            return s
+        else:  # otherwise return at most `size` bytes from the front, keep the rest
+            s = self._buffer[:size]
+            self._buffer = self._buffer[size:]
+            return s
+
+class PartialReadTest(unittest.TestCase):  # base class providing check_partial() to the codec tests
+    def check_partial(self, encoding, input, partialresults):
+        # get a StreamReader for the encoding and feed the bytestring version
+        # of input to the reader byte by byte. Read everything available from
+        # the StreamReader and check that the results equal the appropriate
+        # entries from partialresults (one entry per encoded input byte).
+        q = Queue()
+        r = codecs.getreader(encoding)(q)  # the reader pulls its bytes from q
+        result = u""  # everything decoded so far
+        for (c, partialresult) in zip(input.encode(encoding), partialresults):  # zip() stops at the shorter sequence
+            q.write(c)
+            result += r.read()
+            self.assertEqual(result, partialresult)
+        # check that there's nothing left in the buffers
+        self.assertEqual(r.read(), u"")
+        self.assertEqual(r.bytebuffer, "")
+        self.assertEqual(r.charbuffer, u"")
+
+class UTF16Test(PartialReadTest):

    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
@ -23,6 +61,81 @@ class UTF16Test(unittest.TestCase):
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

+    def test_partial(self):  # decode "utf-16" input fed one byte at a time
+        self.check_partial(
+            "utf-16",
+            u"\x00\xff\u0100\uffff",
+            [  # expected accumulated output after each successive input byte
+                u"", # first byte of BOM read
+                u"", # second byte of BOM read => byteorder known
+                u"",  # only the first byte of the first character has arrived
+                u"\x00",
+                u"\x00",
+                u"\x00\xff",
+                u"\x00\xff",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100\uffff",
+            ]
+        )
+
+class UTF16LETest(PartialReadTest):  # byte-by-byte decoding test for "utf-16-le"
+
+    def test_partial(self):
+        self.check_partial(
+            "utf-16-le",
+            u"\x00\xff\u0100\uffff",
+            [  # expected accumulated output after each successive input byte
+                u"",  # first byte only: no complete character yet
+                u"\x00",
+                u"\x00",
+                u"\x00\xff",
+                u"\x00\xff",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100\uffff",
+            ]
+        )
+
+class UTF16BETest(PartialReadTest):  # byte-by-byte decoding test for "utf-16-be"
+
+    def test_partial(self):
+        self.check_partial(
+            "utf-16-be",
+            u"\x00\xff\u0100\uffff",
+            [  # expected accumulated output after each successive input byte
+                u"",  # first byte only: no complete character yet
+                u"\x00",
+                u"\x00",
+                u"\x00\xff",
+                u"\x00\xff",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100\uffff",
+            ]
+        )
+
+class UTF8Test(PartialReadTest):  # byte-by-byte decoding test for "utf-8"
+
+    def test_partial(self):
+        self.check_partial(
+            "utf-8",
+            u"\x00\xff\u07ff\u0800\uffff",  # characters encoded in 1, 2, 2, 3 and 3 bytes
+            [  # expected accumulated output after each successive input byte
+                u"\x00",
+                u"\x00",
+                u"\x00\xff",
+                u"\x00\xff",
+                u"\x00\xff\u07ff",
+                u"\x00\xff\u07ff",
+                u"\x00\xff\u07ff",
+                u"\x00\xff\u07ff\u0800",
+                u"\x00\xff\u07ff\u0800",
+                u"\x00\xff\u07ff\u0800",
+                u"\x00\xff\u07ff\u0800\uffff",
+            ]
+        )
+
 class EscapeDecodeTest(unittest.TestCase):
    def test_empty_escape_decode(self):
        self.assertEquals(codecs.escape_decode(""), ("", 0))
@ -348,6 +461,9 @@ class CodecsModuleTest(unittest.TestCase):
 def test_main():
    test_support.run_unittest(
        UTF16Test,
+        UTF16LETest,
+        UTF16BETest,
+        UTF8Test,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,