mirror of
https://github.com/python/cpython.git
synced 2025-07-24 11:44:31 +00:00
#8271: The UTF-8 decoder now outputs the correct number of U+FFFD characters when used with the "replace" error handler on invalid UTF-8 sequences. Patch by Serhiy Storchaka, tests by Ezio Melotti.
This commit is contained in:
parent
55b5d5c919
commit
f7ed5d111b
4 changed files with 292 additions and 38 deletions
|
@ -4759,9 +4759,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
|
|||
goto End;
|
||||
errmsg = "unexpected end of data";
|
||||
startinpos = s - starts;
|
||||
endinpos = startinpos + 1;
|
||||
while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
|
||||
endinpos++;
|
||||
endinpos = end - starts;
|
||||
break;
|
||||
case 1:
|
||||
errmsg = "invalid start byte";
|
||||
|
@ -4769,11 +4767,11 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
|
|||
endinpos = startinpos + 1;
|
||||
break;
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
errmsg = "invalid continuation byte";
|
||||
startinpos = s - starts;
|
||||
endinpos = startinpos + 1;
|
||||
while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
|
||||
endinpos++;
|
||||
endinpos = startinpos + ch - 1;
|
||||
break;
|
||||
default:
|
||||
if (unicode_putchar(&unicode, &outpos, ch) < 0)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue