Issue #12016: Multibyte CJK decoders now resynchronize faster

On a decoding error, they now skip only the first byte of an invalid byte
sequence instead of the whole multibyte sequence.

For example, b'\xff\n'.decode('gb2312', 'replace') now gives '\ufffd\n' instead
of '\ufffd'.
This commit is contained in:
Victor Stinner 2011-07-08 01:45:13 +02:00
parent 081fe46ff9
commit 2cded9c3f3
13 changed files with 159 additions and 93 deletions

View file

@ -85,7 +85,7 @@ DECODER(gb2312)
TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
NEXT(2, 1)
}
else return 2;
else return 1;
}
return 0;
@ -141,7 +141,7 @@ DECODER(gbk)
REQUIRE_INBUF(2)
GBK_DECODE(c, IN2, **outbuf)
else return 2;
else return 1;
NEXT(2, 1)
}
@ -267,7 +267,7 @@ DECODER(gb18030)
c3 = IN3;
c4 = IN4;
if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)
return 4;
return 1;
c -= 0x81; c2 -= 0x30;
c3 -= 0x81; c4 -= 0x30;
@ -292,12 +292,12 @@ DECODER(gb18030)
continue;
}
}
return 4;
return 1;
}
GBK_DECODE(c, c2, **outbuf)
else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
else return 2;
else return 1;
NEXT(2, 1)
}
@ -400,7 +400,7 @@ DECODER(hz)
else if (c2 == '\n')
; /* line-continuation */
else
return 2;
return 1;
NEXT(2, 0);
continue;
}
@ -419,7 +419,7 @@ DECODER(hz)
NEXT(2, 1)
}
else
return 2;
return 1;
}
}

View file

@ -161,7 +161,7 @@ DECODER(big5hkscs)
case 0x8864: WRITE2(0x00ca, 0x030c); break;
case 0x88a3: WRITE2(0x00ea, 0x0304); break;
case 0x88a5: WRITE2(0x00ea, 0x030c); break;
default: return 2;
default: return 1;
}
NEXT(2, 2) /* all decoded codepoints are pairs, above. */

View file

@ -112,7 +112,7 @@ DECODER(cp932)
TRYMAP_DEC(cp932ext, **outbuf, c, c2);
else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){
if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
return 2;
return 1;
c = (c < 0xe0 ? c - 0x81 : c - 0xc1);
c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
@ -120,7 +120,7 @@ DECODER(cp932)
c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21;
TRYMAP_DEC(jisx0208, **outbuf, c, c2);
else return 2;
else return 1;
}
else if (c >= 0xf0 && c <= 0xf9) {
if ((c2 >= 0x40 && c2 <= 0x7e) ||
@ -128,10 +128,10 @@ DECODER(cp932)
OUT1(0xe000 + 188 * (c - 0xf0) +
(c2 < 0x80 ? c2 - 0x40 : c2 - 0x41))
else
return 2;
return 1;
}
else
return 2;
return 1;
NEXT(2, 1)
}
@ -256,7 +256,7 @@ DECODER(euc_jis_2004)
NEXT(2, 1)
}
else
return 2;
return 1;
}
else if (c == 0x8f) {
unsigned char c2, c3;
@ -274,7 +274,7 @@ DECODER(euc_jis_2004)
continue;
}
else TRYMAP_DEC(jisx0212, **outbuf, c2, c3) ;
else return 3;
else return 1;
NEXT(3, 1)
}
else {
@ -300,7 +300,7 @@ DECODER(euc_jis_2004)
NEXT(2, 2)
continue;
}
else return 2;
else return 1;
NEXT(2, 1)
}
}
@ -388,7 +388,7 @@ DECODER(euc_jp)
NEXT(2, 1)
}
else
return 2;
return 1;
}
else if (c == 0x8f) {
unsigned char c2, c3;
@ -401,7 +401,7 @@ DECODER(euc_jp)
NEXT(3, 1)
}
else
return 3;
return 1;
}
else {
unsigned char c2;
@ -417,7 +417,7 @@ DECODER(euc_jp)
#endif
TRYMAP_DEC(jisx0208, **outbuf,
c ^ 0x80, c2 ^ 0x80) ;
else return 2;
else return 1;
NEXT(2, 1)
}
}
@ -502,7 +502,7 @@ DECODER(shift_jis)
REQUIRE_INBUF(2)
c2 = IN2;
if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
return 2;
return 1;
c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1);
c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
@ -522,10 +522,10 @@ DECODER(shift_jis)
continue;
}
else
return 2;
return 1;
}
else
return 2;
return 1;
NEXT(1, 1) /* JIS X 0201 */
}
@ -645,7 +645,7 @@ DECODER(shift_jis_2004)
REQUIRE_INBUF(2)
c2 = IN2;
if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
return 2;
return 1;
c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1);
c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
@ -671,7 +671,7 @@ DECODER(shift_jis_2004)
NEXT_OUT(2)
}
else
return 2;
return 1;
NEXT_IN(2)
}
else { /* Plane 2 */
@ -689,13 +689,13 @@ DECODER(shift_jis_2004)
continue;
}
else
return 2;
return 1;
NEXT(2, 1)
}
continue;
}
else
return 2;
return 1;
NEXT(1, 1) /* JIS X 0201 */
}

View file

@ -123,7 +123,7 @@ DECODER(euc_kr)
if ((*inbuf)[2] != EUCKR_JAMO_FIRSTBYTE ||
(*inbuf)[4] != EUCKR_JAMO_FIRSTBYTE ||
(*inbuf)[6] != EUCKR_JAMO_FIRSTBYTE)
return 8;
return 1;
c = (*inbuf)[3];
if (0xa1 <= c && c <= 0xbe)
@ -143,7 +143,7 @@ DECODER(euc_kr)
jong = NONE;
if (cho == NONE || jung == NONE || jong == NONE)
return 8;
return 1;
OUT1(0xac00 + cho*588 + jung*28 + jong);
NEXT(8, 1)
@ -152,7 +152,7 @@ DECODER(euc_kr)
NEXT(2, 1)
}
else
return 2;
return 1;
}
return 0;
@ -208,7 +208,7 @@ DECODER(cp949)
REQUIRE_INBUF(2)
TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80);
else TRYMAP_DEC(cp949ext, **outbuf, c, IN2);
else return 2;
else return 1;
NEXT(2, 1)
}
@ -375,7 +375,7 @@ DECODER(johab)
i_jong = johabidx_jongseong[c_jong];
if (i_cho == NONE || i_jung == NONE || i_jong == NONE)
return 2;
return 1;
/* we don't use U+1100 hangul jamo yet. */
if (i_cho == FILL) {
@ -391,7 +391,7 @@ DECODER(johab)
OUT1(0x3100 |
johabjamo_jungseong[c_jung])
else
return 2;
return 1;
}
} else {
if (i_jung == FILL) {
@ -399,7 +399,7 @@ DECODER(johab)
OUT1(0x3100 |
johabjamo_choseong[c_cho])
else
return 2;
return 1;
}
else
OUT1(0xac00 +
@ -414,7 +414,7 @@ DECODER(johab)
c2 < 0x31 || (c2 >= 0x80 && c2 < 0x91) ||
(c2 & 0x7f) == 0x7f ||
(c == 0xda && (c2 >= 0xa1 && c2 <= 0xd3)))
return 2;
return 1;
else {
unsigned char t1, t2;
@ -425,7 +425,7 @@ DECODER(johab)
t2 = (t2 < 0x5e ? t2 : t2 - 0x5e) + 0x21;
TRYMAP_DEC(ksx1001, **outbuf, t1, t2);
else return 2;
else return 1;
NEXT(2, 1)
}
}

View file

@ -55,7 +55,7 @@ DECODER(big5)
TRYMAP_DEC(big5, **outbuf, c, IN2) {
NEXT(2, 1)
}
else return 2;
else return 1;
}
return 0;
@ -109,7 +109,7 @@ DECODER(cp950)
TRYMAP_DEC(cp950ext, **outbuf, c, IN2);
else TRYMAP_DEC(big5, **outbuf, c, IN2);
else return 2;
else return 1;
NEXT(2, 1)
}