bpo-33578: Add getstate/setstate for CJK codec (GH-6984)

This implements getstate and setstate for the cjkcodecs multibyte incremental encoders/decoders, primarily to fix issues with seek/tell.

The encoder getstate/setstate is slightly tricky as the "state" is pending bytes + MultibyteCodec_State but only an integer can be returned. The approach I've taken is to encode this data into a long, similar to how .tell() encodes a "cookie_type" as a long.


https://bugs.python.org/issue33578
This commit is contained in:
Christopher Thorne 2018-11-01 10:48:49 +00:00 committed by Miss Islington (bot)
parent 4b5e62dbb2
commit ac22f6aa98
8 changed files with 416 additions and 22 deletions

View file

@ -895,6 +895,93 @@ _multibytecodec_MultibyteIncrementalEncoder_encode_impl(MultibyteIncrementalEnco
return encoder_encode_stateful(STATEFUL_ECTX(self), input, final);
}
/*[clinic input]
_multibytecodec.MultibyteIncrementalEncoder.getstate
[clinic start generated code]*/
static PyObject *
_multibytecodec_MultibyteIncrementalEncoder_getstate_impl(MultibyteIncrementalEncoderObject *self)
/*[clinic end generated code: output=9794a5ace70d7048 input=4a2a82874ffa40bb]*/
{
    /* Return the encoder state packed into a single Python integer.

       Layout of the packed bytes (converted little-endian, unsigned):
         byte 0       length in bytes of the UTF-8 encoded pending text
         bytes 1..n   the UTF-8 encoded pending text; n is at most
                      MAXENCPENDING*4 because one character takes up to
                      4 bytes in UTF-8
         the rest     a copy of self->state.c

       The state is assembled byte by byte so the resulting value does not
       depend on how a particular compiler lays out structs (e.g. padding).

       Returns a new reference, or NULL with an exception set. */
    unsigned char packed[1 + MAXENCPENDING*4 + sizeof(self->state.c)];
    Py_ssize_t used = 1;    /* the length byte is always emitted */

    packed[0] = 0;
    if (self->pending != NULL) {
        Py_ssize_t utf8len;
        const char *utf8 = PyUnicode_AsUTF8AndSize(self->pending, &utf8len);

        if (utf8 == NULL) {
            return NULL;
        }
        if (utf8len > MAXENCPENDING*4) {
            PyErr_SetString(PyExc_UnicodeError, "pending buffer too large");
            return NULL;
        }
        packed[0] = utf8len;
        memcpy(packed + 1, utf8, utf8len);
        used += utf8len;
    }

    /* The codec state bytes follow directly after the pending text. */
    memcpy(packed + used, self->state.c, sizeof(self->state.c));
    used += sizeof(self->state.c);

    return (PyObject *)_PyLong_FromByteArray(packed, used,
                                             1 /* little-endian */ ,
                                             0 /* unsigned */ );
}
/*[clinic input]
_multibytecodec.MultibyteIncrementalEncoder.setstate
state as statelong: object(type='PyLongObject *', subclass_of='&PyLong_Type')
/
[clinic start generated code]*/
static PyObject *
_multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEncoderObject *self,
                                                          PyLongObject *statelong)
/*[clinic end generated code: output=4e5e98ac1f4039ca input=c80fb5830d4d2f76]*/
{
    /* Restore the encoder state from an integer.

       The integer is unpacked (little-endian, unsigned) into:
         byte 0       length in bytes of the UTF-8 encoded pending text
         bytes 1..n   the UTF-8 encoded pending text
         following    the bytes copied into self->state.c

       Nothing on self is modified until the whole value has been
       validated.  Returns None, or NULL with an exception set. */
    unsigned char packed[1 + MAXENCPENDING*4 + sizeof(self->state.c)];
    PyObject *restored;

    if (_PyLong_AsByteArray(statelong, packed, sizeof(packed),
                            1 /* little-endian */ ,
                            0 /* unsigned */ ) < 0) {
        return NULL;
    }

    /* Bounding the length byte keeps both reads below inside packed[]. */
    if (packed[0] > MAXENCPENDING*4) {
        PyErr_SetString(PyExc_UnicodeError, "pending buffer too large");
        return NULL;
    }

    restored = PyUnicode_DecodeUTF8((const char *)packed + 1,
                                    packed[0], "strict");
    if (restored == NULL) {
        return NULL;
    }

    /* Drop the previous pending text and take ownership of the new one. */
    Py_CLEAR(self->pending);
    self->pending = restored;

    memcpy(self->state.c, packed + 1 + packed[0],
           sizeof(self->state.c));

    Py_RETURN_NONE;
}
/*[clinic input]
_multibytecodec.MultibyteIncrementalEncoder.reset
[clinic start generated code]*/
@ -919,6 +1006,8 @@ _multibytecodec_MultibyteIncrementalEncoder_reset_impl(MultibyteIncrementalEncod
/* PyMethodDef table named for the incremental encoder: it lists the
   encode, getstate, setstate and reset entries.
   The *_METHODDEF macros are defined outside this view; presumably each
   expands to one complete table entry including its trailing comma, since
   no separators are written between them here -- confirm against the
   generated clinic header. */
static struct PyMethodDef mbiencoder_methods[] = {
_MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_ENCODE_METHODDEF
_MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_GETSTATE_METHODDEF
_MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_SETSTATE_METHODDEF
_MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_RESET_METHODDEF
/* all-NULL entry terminating the table */
{NULL, NULL},
};
@ -984,6 +1073,7 @@ mbiencoder_dealloc(MultibyteIncrementalEncoderObject *self)
{
PyObject_GC_UnTrack(self);
ERROR_DECREF(self->errors);
Py_CLEAR(self->pending);
Py_TYPE(self)->tp_free(self);
}
@ -1119,6 +1209,68 @@ errorexit:
return NULL;
}
/*[clinic input]
_multibytecodec.MultibyteIncrementalDecoder.getstate
[clinic start generated code]*/
static PyObject *
_multibytecodec_MultibyteIncrementalDecoder_getstate_impl(MultibyteIncrementalDecoderObject *self)
/*[clinic end generated code: output=255009c4713b7f82 input=4006aa49bddbaa75]*/
{
    /* Return the decoder state as a 2-tuple (pending, state):
         pending  bytes object holding the self->pendingsize undecoded
                  input bytes stored in self->pending
         state    int whose little-endian, unsigned byte representation
                  is the full contents of self->state.c

       Returns a new reference, or NULL with an exception set. */
    PyObject *buffer;
    PyObject *statelong;

    buffer = PyBytes_FromStringAndSize((const char *)self->pending,
                                       self->pendingsize);
    if (buffer == NULL) {
        return NULL;
    }

    /* Serialize every byte of state.c.  Dereferencing the array
       (*self->state.c) would capture only its first byte and silently
       drop the rest of the codec state.  An explicit byte order also
       keeps the value identical across platforms. */
    statelong = (PyObject *)_PyLong_FromByteArray(self->state.c,
                                                  sizeof(self->state.c),
                                                  1 /* little-endian */ ,
                                                  0 /* unsigned */ );
    if (statelong == NULL) {
        Py_DECREF(buffer);
        return NULL;
    }

    /* "N" transfers ownership of both references to the new tuple. */
    return Py_BuildValue("NN", buffer, statelong);
}
/*[clinic input]
_multibytecodec.MultibyteIncrementalDecoder.setstate
state: object(subclass_of='&PyTuple_Type')
/
[clinic start generated code]*/
static PyObject *
_multibytecodec_MultibyteIncrementalDecoder_setstate_impl(MultibyteIncrementalDecoderObject *self,
                                                          PyObject *state)
/*[clinic end generated code: output=106b2fbca3e2dcc2 input=e5d794e8baba1a47]*/
{
    /* Restore the decoder state from a 2-tuple (pending, state):
         pending  bytes object of at most MAXDECPENDING bytes, copied
                  into self->pending
         state    non-negative int; its little-endian byte representation
                  is copied into self->state.c

       All validation happens before self is touched, so a rejected
       argument leaves the decoder unchanged.  Returns None, or NULL with
       an exception set (an int that does not fit in state.c is reported
       by _PyLong_AsByteArray). */
    PyObject *buffer;
    PyLongObject *statelong;
    Py_ssize_t buffersize;
    const char *bufferstr;
    unsigned char statebytes[sizeof(self->state.c)];

    if (!PyArg_ParseTuple(state, "SO!;setstate(): illegal state argument",
                          &buffer, &PyLong_Type, &statelong))
    {
        return NULL;
    }

    /* Unpack with an explicit byte order and size.  Copying the in-memory
       representation of a C integer would make the restored bytes depend
       on host endianness, and silently truncate oversized values. */
    if (_PyLong_AsByteArray(statelong, statebytes, sizeof(statebytes),
                            1 /* little-endian */ ,
                            0 /* unsigned */ ) < 0) {
        return NULL;
    }

    buffersize = PyBytes_Size(buffer);
    if (buffersize == -1) {
        return NULL;
    }

    /* Guard the fixed-size self->pending buffer against overflow. */
    if (buffersize > MAXDECPENDING) {
        PyErr_SetString(PyExc_UnicodeError, "pending buffer too large");
        return NULL;
    }

    bufferstr = PyBytes_AsString(buffer);
    if (bufferstr == NULL) {
        return NULL;
    }

    self->pendingsize = buffersize;
    memcpy(self->pending, bufferstr, self->pendingsize);
    memcpy(self->state.c, statebytes, sizeof(statebytes));

    Py_RETURN_NONE;
}
/*[clinic input]
_multibytecodec.MultibyteIncrementalDecoder.reset
[clinic start generated code]*/
@ -1137,6 +1289,8 @@ _multibytecodec_MultibyteIncrementalDecoder_reset_impl(MultibyteIncrementalDecod
/* PyMethodDef table named for the incremental decoder: it lists the
   decode, getstate, setstate and reset entries.
   The *_METHODDEF macros are defined outside this view; presumably each
   expands to one complete table entry including its trailing comma, since
   no separators are written between them here -- confirm against the
   generated clinic header. */
static struct PyMethodDef mbidecoder_methods[] = {
_MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_DECODE_METHODDEF
_MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_GETSTATE_METHODDEF
_MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_SETSTATE_METHODDEF
_MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_RESET_METHODDEF
/* all-NULL entry terminating the table */
{NULL, NULL},
};