bpo-24214: Fixed the UTF-8 incremental decoder. (GH-12603)

The bug occurred when an encoded surrogate character was passed to the incremental decoder in two chunks.
2025-09-26 18:29:57 +00:00 · 2019-03-30 08:23:38 +02:00 · 2019-03-30 08:23:38 +02:00 · 7a465cb5ee
commit 7a465cb5ee
parent 38f4e468d4
3 changed files with 14 additions and 0 deletions
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@ -406,6 +406,15 @@ class ReadTest(MixInCheckStateHandling):
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)
    def test_incremental_surrogatepass(self):
        # Test incremental decoder for surrogatepass handler:
        # see issue #24214
        data = '\uD901'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:], True), '\uD901')
 class UTF32Test(ReadTest, unittest.TestCase):
    encoding = "utf-32"
--- a/Builtins/2019-03-28-15-22-45.bpo-24214.tZ6lYU.rst
+++ b/Builtins/2019-03-28-15-22-45.bpo-24214.tZ6lYU.rst
@ -0,0 +1,2 @@
 Fixed support of the surrogatepass error handler in the UTF-8 incremental
 decoder.
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -4883,6 +4883,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
        case 2:
        case 3:
        case 4:
            if (s == end || consumed) {
                goto End;
            }
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + ch - 1;
		`@ -0,0 +1,2 @@`
							`Fixed support of the surrogatepass error handler in the UTF-8 incremental`
							`decoder.`