bpo-24214: Fixed the UTF-8 incremental decoder. (GH-12603)

The bug occurred when an encoded surrogate character was passed to the incremental decoder in two chunks.
2025-11-20 02:50:14 +00:00 · 2019-03-30 08:23:38 +02:00 · 2019-03-30 08:23:38 +02:00 · 7a465cb5ee
commit 7a465cb5ee
parent 38f4e468d4
3 changed files with 14 additions and 0 deletions
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -406,6 +406,15 @@ class ReadTest(MixInCheckStateHandling):
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)

+    def test_incremental_surrogatepass(self):
+        # Test incremental decoder for surrogatepass handler:
+        # see issue #24214
+        data = '\uD901'.encode(self.encoding, 'surrogatepass')
+        for i in range(1, len(data)):
+            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
+            self.assertEqual(dec.decode(data[:i]), '')
+            self.assertEqual(dec.decode(data[i:], True), '\uD901')
+

 class UTF32Test(ReadTest, unittest.TestCase):
    encoding = "utf-32"