bpo-33578: Add getstate/setstate for CJK codec (GH-6984)

This implements getstate and setstate for the cjkcodecs multibyte incremental encoders/decoders, primarily to fix issues with seek/tell.

The encoder getstate/setstate is slightly tricky because the "state" consists of the pending bytes plus the MultibyteCodec_State, but the encoder's getstate may only return an integer. The approach I've taken is to pack this data into a single long, similar to how .tell() encodes a "cookie_type" as a long.


https://bugs.python.org/issue33578
This commit is contained in:
Christopher Thorne 2018-11-01 10:48:49 +00:00 committed by Miss Islington (bot)
parent 4b5e62dbb2
commit ac22f6aa98
8 changed files with 416 additions and 22 deletions

View file

@ -51,6 +51,12 @@
; \
}
/*
* Codecs in this file use the first byte of MultibyteCodec_State.c[8]
* to store a 0 or 1 state value. CN_STATE_OFFSET is the index of that
* byte within the c[] array. The hz codec uses 0 for ASCII mode and
* 1 for GB mode.
*/
#define CN_STATE_OFFSET 0
/*
* GB2312 codec
*/
@ -329,15 +335,15 @@ DECODER(gb18030)
/*
* Initialise the hz encoder: the mode state is set to 0, the value the
* hz decoder in this file labels as ASCII mode. Always returns 0.
*
* NOTE(review): both state->i and state->c[CN_STATE_OFFSET] are assigned
* here. This looks like a diff rendering that shows the old line next to
* its replacement -- confirm which assignment the real file keeps.
*/
ENCODER_INIT(hz)
{
state->i = 0;
state->c[CN_STATE_OFFSET] = 0;
return 0;
}
ENCODER_RESET(hz)
{
if (state->i != 0) {
if (state->c[CN_STATE_OFFSET] != 0) {
WRITEBYTE2('~', '}');
state->i = 0;
state->c[CN_STATE_OFFSET] = 0;
NEXT_OUT(2);
}
return 0;
@ -350,10 +356,10 @@ ENCODER(hz)
DBCHAR code;
if (c < 0x80) {
if (state->i) {
if (state->c[CN_STATE_OFFSET]) {
WRITEBYTE2('~', '}');
NEXT_OUT(2);
state->i = 0;
state->c[CN_STATE_OFFSET] = 0;
}
WRITEBYTE1((unsigned char)c);
NEXT(1, 1);
@ -375,10 +381,10 @@ ENCODER(hz)
if (code & 0x8000) /* MSB set: GBK */
return 1;
if (state->i == 0) {
if (state->c[CN_STATE_OFFSET] == 0) {
WRITEBYTE4('~', '{', code >> 8, code & 0xff);
NEXT(1, 4);
state->i = 1;
state->c[CN_STATE_OFFSET] = 1;
}
else {
WRITEBYTE2(code >> 8, code & 0xff);
@ -391,13 +397,13 @@ ENCODER(hz)
/*
* Initialise the hz decoder: the mode state is set to 0, which the
* decoder body treats as ASCII mode. Always returns 0.
*
* NOTE(review): both state->i and state->c[CN_STATE_OFFSET] are assigned
* here. This looks like a diff rendering that shows the old line next to
* its replacement -- confirm which assignment the real file keeps.
*/
DECODER_INIT(hz)
{
state->i = 0;
state->c[CN_STATE_OFFSET] = 0;
return 0;
}
/*
* Reset the hz decoder: the mode state goes back to 0 (ASCII mode), the
* same value DECODER_INIT(hz) assigns. Always returns 0.
*
* NOTE(review): both state->i and state->c[CN_STATE_OFFSET] are assigned
* here. This looks like a diff rendering that shows the old line next to
* its replacement -- confirm which assignment the real file keeps.
*/
DECODER_RESET(hz)
{
state->i = 0;
state->c[CN_STATE_OFFSET] = 0;
return 0;
}
@ -411,14 +417,14 @@ DECODER(hz)
unsigned char c2 = INBYTE2;
REQUIRE_INBUF(2);
if (c2 == '~' && state->i == 0)
if (c2 == '~' && state->c[CN_STATE_OFFSET] == 0)
OUTCHAR('~');
else if (c2 == '{' && state->i == 0)
state->i = 1; /* set GB */
else if (c2 == '\n' && state->i == 0)
else if (c2 == '{' && state->c[CN_STATE_OFFSET] == 0)
state->c[CN_STATE_OFFSET] = 1; /* set GB */
else if (c2 == '\n' && state->c[CN_STATE_OFFSET] == 0)
; /* line-continuation */
else if (c2 == '}' && state->i == 1)
state->i = 0; /* set ASCII */
else if (c2 == '}' && state->c[CN_STATE_OFFSET] == 1)
state->c[CN_STATE_OFFSET] = 0; /* set ASCII */
else
return 1;
NEXT_IN(2);
@ -428,7 +434,7 @@ DECODER(hz)
if (c & 0x80)
return 1;
if (state->i == 0) { /* ASCII mode */
if (state->c[CN_STATE_OFFSET] == 0) { /* ASCII mode */
OUTCHAR(c);
NEXT_IN(1);
}