#8271: the utf-8 decoder now outputs the correct number of U+FFFD characters when used with the "replace" error handler on invalid utf-8 sequences. Patch by Serhiy Storchaka, tests by Ezio Melotti.

2025-10-28 01:00:34 +00:00 · 2012-11-04 23:21:38 +02:00 · 2012-11-04 23:21:38 +02:00 · f7ed5d111b
commit f7ed5d111b
parent 55b5d5c919
4 changed files with 292 additions and 38 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4759,9 +4759,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
-            endinpos = startinpos + 1;
-            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
-                endinpos++;
+            endinpos = end - starts;
            break;
        case 1:
            errmsg = "invalid start byte";
@@ -4769,11 +4767,11 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
            endinpos = startinpos + 1;
            break;
        case 2:
+        case 3:
+        case 4:
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
-            endinpos = startinpos + 1;
-            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
-                endinpos++;
+            endinpos = startinpos + ch - 1;
            break;
        default:
            if (unicode_putchar(&unicode, &outpos, ch) < 0)