Issue #16475: Support object instancing, recursion and interned strings

in marshal
This commit is contained in:
Kristján Valur Jónsson 2013-03-19 18:02:10 -07:00
parent c7c42efb16
commit d7009c6913
5 changed files with 376 additions and 32 deletions

View file

@ -1,8 +1,10 @@
/* Write Python objects to files and read them back.
This is intended for writing and reading compiled Python code only;
a true persistent storage facility would be much harder, since
it would have to take circular links and sharing into account. */
This is primarily intended for writing and reading compiled Python code,
even though dicts, lists, sets and frozensets, not commonly seen in
code objects, are supported.
Version 3 of this protocol properly supports circular links
and sharing. */
#define PY_SSIZE_T_CLEAN
@ -41,6 +43,8 @@
#define TYPE_BINARY_COMPLEX 'y'
#define TYPE_LONG 'l'
#define TYPE_STRING 's'
#define TYPE_INTERNED 't'
#define TYPE_REF 'r'
#define TYPE_TUPLE '('
#define TYPE_LIST '['
#define TYPE_DICT '{'
@ -49,6 +53,7 @@
#define TYPE_UNKNOWN '?'
#define TYPE_SET '<'
#define TYPE_FROZENSET '>'
#define FLAG_REF '\x80' /* with a type, add obj to index */
#define WFERR_OK 0
#define WFERR_UNMARSHALLABLE 1
@ -65,6 +70,7 @@ typedef struct {
PyObject *current_filename;
char *ptr;
char *end;
PyObject *refs; /* dict on marshal, list on unmarshal */
int version;
} WFILE;
@ -151,13 +157,17 @@ w_long(long x, WFILE *p)
#endif
#define PyLong_MARSHAL_RATIO (PyLong_SHIFT / PyLong_MARSHAL_SHIFT)
#define W_TYPE(t, p) do { \
w_byte((t) | flag, (p)); \
} while(0)
static void
w_PyLong(const PyLongObject *ob, WFILE *p)
w_PyLong(const PyLongObject *ob, char flag, WFILE *p)
{
Py_ssize_t i, j, n, l;
digit d;
w_byte(TYPE_LONG, p);
W_TYPE(TYPE_LONG, p);
if (Py_SIZE(ob) == 0) {
w_long((long)0, p);
return;
@ -194,10 +204,64 @@ w_PyLong(const PyLongObject *ob, WFILE *p)
} while (d != 0);
}
static int
w_ref(PyObject *v, char *flag, WFILE *p)
{
PyObject *id;
PyObject *idx;
if (p->version < 3 || p->refs == NULL)
return 0; /* not writing object references */
/* if it has only one reference, it definitely isn't shared */
if (Py_REFCNT(v) == 1)
return 0;
id = PyLong_FromVoidPtr((void*)v);
if (id == NULL)
goto err;
idx = PyDict_GetItem(p->refs, id);
if (idx != NULL) {
/* write the reference index to the stream */
long w = PyLong_AsLong(idx);
Py_DECREF(id);
if (w == -1 && PyErr_Occurred()) {
goto err;
}
/* we don't store "long" indices in the dict */
assert(0 <= w && w <= 0x7fffffff);
w_byte(TYPE_REF, p);
w_long(w, p);
return 1;
} else {
int ok;
Py_ssize_t s = PyDict_Size(p->refs);
/* we don't support long indices */
if (s >= 0x7fffffff) {
PyErr_SetString(PyExc_ValueError, "too many objects");
goto err;
}
idx = PyLong_FromSsize_t(s);
ok = idx && PyDict_SetItem(p->refs, id, idx) == 0;
Py_DECREF(id);
Py_XDECREF(idx);
if (!ok)
goto err;
*flag |= FLAG_REF;
return 0;
}
err:
p->error = WFERR_UNMARSHALLABLE;
return 1;
}
static void
w_complex_object(PyObject *v, char flag, WFILE *p);
static void
w_object(PyObject *v, WFILE *p)
{
Py_ssize_t i, n;
char flag = '\0';
p->depth++;
@ -222,24 +286,35 @@ w_object(PyObject *v, WFILE *p)
else if (v == Py_True) {
w_byte(TYPE_TRUE, p);
}
else if (PyLong_CheckExact(v)) {
else if (!w_ref(v, &flag, p))
w_complex_object(v, flag, p);
p->depth--;
}
static void
w_complex_object(PyObject *v, char flag, WFILE *p)
{
Py_ssize_t i, n;
if (PyLong_CheckExact(v)) {
long x = PyLong_AsLong(v);
if ((x == -1) && PyErr_Occurred()) {
PyLongObject *ob = (PyLongObject *)v;
PyErr_Clear();
w_PyLong(ob, p);
w_PyLong(ob, flag, p);
}
else {
#if SIZEOF_LONG > 4
long y = Py_ARITHMETIC_RIGHT_SHIFT(long, x, 31);
if (y && y != -1) {
/* Too large for TYPE_INT */
w_PyLong((PyLongObject*)v, p);
w_PyLong((PyLongObject*)v, flag, p);
}
else
#endif
{
w_byte(TYPE_INT, p);
W_TYPE(TYPE_INT, p);
w_long(x, p);
}
}
@ -252,7 +327,7 @@ w_object(PyObject *v, WFILE *p)
p->error = WFERR_UNMARSHALLABLE;
return;
}
w_byte(TYPE_BINARY_FLOAT, p);
W_TYPE(TYPE_BINARY_FLOAT, p);
w_string((char*)buf, 8, p);
}
else {
@ -263,7 +338,7 @@ w_object(PyObject *v, WFILE *p)
return;
}
n = strlen(buf);
w_byte(TYPE_FLOAT, p);
W_TYPE(TYPE_FLOAT, p);
w_byte((int)n, p);
w_string(buf, n, p);
PyMem_Free(buf);
@ -277,7 +352,7 @@ w_object(PyObject *v, WFILE *p)
p->error = WFERR_UNMARSHALLABLE;
return;
}
w_byte(TYPE_BINARY_COMPLEX, p);
W_TYPE(TYPE_BINARY_COMPLEX, p);
w_string((char*)buf, 8, p);
if (_PyFloat_Pack8(PyComplex_ImagAsDouble(v),
buf, 1) < 0) {
@ -288,7 +363,7 @@ w_object(PyObject *v, WFILE *p)
}
else {
char *buf;
w_byte(TYPE_COMPLEX, p);
W_TYPE(TYPE_COMPLEX, p);
buf = PyOS_double_to_string(PyComplex_RealAsDouble(v),
'g', 17, 0, NULL);
if (!buf) {
@ -312,7 +387,7 @@ w_object(PyObject *v, WFILE *p)
}
}
else if (PyBytes_CheckExact(v)) {
w_byte(TYPE_STRING, p);
W_TYPE(TYPE_STRING, p);
n = PyBytes_GET_SIZE(v);
W_SIZE(n, p);
w_string(PyBytes_AS_STRING(v), n, p);
@ -325,14 +400,17 @@ w_object(PyObject *v, WFILE *p)
p->error = WFERR_UNMARSHALLABLE;
return;
}
w_byte(TYPE_UNICODE, p);
if (p->version >= 3 && PyUnicode_CHECK_INTERNED(v))
W_TYPE(TYPE_INTERNED, p);
else
W_TYPE(TYPE_UNICODE, p);
n = PyBytes_GET_SIZE(utf8);
W_SIZE(n, p);
w_string(PyBytes_AS_STRING(utf8), n, p);
Py_DECREF(utf8);
}
else if (PyTuple_CheckExact(v)) {
w_byte(TYPE_TUPLE, p);
W_TYPE(TYPE_TUPLE, p);
n = PyTuple_Size(v);
W_SIZE(n, p);
for (i = 0; i < n; i++) {
@ -340,7 +418,7 @@ w_object(PyObject *v, WFILE *p)
}
}
else if (PyList_CheckExact(v)) {
w_byte(TYPE_LIST, p);
W_TYPE(TYPE_LIST, p);
n = PyList_GET_SIZE(v);
W_SIZE(n, p);
for (i = 0; i < n; i++) {
@ -350,7 +428,7 @@ w_object(PyObject *v, WFILE *p)
else if (PyDict_CheckExact(v)) {
Py_ssize_t pos;
PyObject *key, *value;
w_byte(TYPE_DICT, p);
W_TYPE(TYPE_DICT, p);
/* This one is NULL object terminated! */
pos = 0;
while (PyDict_Next(v, &pos, &key, &value)) {
@ -363,9 +441,9 @@ w_object(PyObject *v, WFILE *p)
PyObject *value, *it;
if (PyObject_TypeCheck(v, &PySet_Type))
w_byte(TYPE_SET, p);
W_TYPE(TYPE_SET, p);
else
w_byte(TYPE_FROZENSET, p);
W_TYPE(TYPE_FROZENSET, p);
n = PyObject_Size(v);
if (n == -1) {
p->depth--;
@ -392,7 +470,7 @@ w_object(PyObject *v, WFILE *p)
}
else if (PyCode_Check(v)) {
PyCodeObject *co = (PyCodeObject *)v;
w_byte(TYPE_CODE, p);
W_TYPE(TYPE_CODE, p);
w_long(co->co_argcount, p);
w_long(co->co_kwonlyargcount, p);
w_long(co->co_nlocals, p);
@ -419,7 +497,7 @@ w_object(PyObject *v, WFILE *p)
p->error = WFERR_UNMARSHALLABLE;
return;
}
w_byte(TYPE_STRING, p);
W_TYPE(TYPE_STRING, p);
n = view.len;
s = view.buf;
W_SIZE(n, p);
@ -427,10 +505,9 @@ w_object(PyObject *v, WFILE *p)
PyBuffer_Release(&view);
}
else {
w_byte(TYPE_UNKNOWN, p);
W_TYPE(TYPE_UNKNOWN, p);
p->error = WFERR_UNMARSHALLABLE;
}
p->depth--;
}
/* version currently has no effect for writing longs. */
@ -441,6 +518,7 @@ PyMarshal_WriteLongToFile(long x, FILE *fp, int version)
wf.fp = fp;
wf.error = WFERR_OK;
wf.depth = 0;
wf.refs = NULL;
wf.version = version;
w_long(x, &wf);
}
@ -452,8 +530,14 @@ PyMarshal_WriteObjectToFile(PyObject *x, FILE *fp, int version)
wf.fp = fp;
wf.error = WFERR_OK;
wf.depth = 0;
if (version >= 3) {
if ((wf.refs = PyDict_New()) == NULL)
return; /* caller mush check PyErr_Occurred() */
} else
wf.refs = NULL;
wf.version = version;
w_object(x, &wf);
Py_XDECREF(wf.refs);
}
typedef WFILE RFILE; /* Same struct with different invariants */
@ -489,7 +573,7 @@ r_string(char *s, Py_ssize_t n, RFILE *p)
data->ob_type->tp_name);
}
else {
read = PyBytes_GET_SIZE(data);
read = (int)PyBytes_GET_SIZE(data);
if (read > 0) {
ptr = PyBytes_AS_STRING(data);
memcpy(s, ptr, read);
@ -659,6 +743,59 @@ r_PyLong(RFILE *p)
return NULL;
}
/* allocate the reflist index */
static PyObject *
r_ref_reserve(PyObject *o, Py_ssize_t *idx, int flag, RFILE *p)
{
if (flag) { /* currently only FLAG_REF is defined */
*idx = PyList_Size(p->refs);
if (*idx < 0)
goto err;
if (*idx >= 0x7ffffffe) {
PyErr_SetString(PyExc_ValueError, "bad marshal data (index list too large)");
goto err;
}
if (PyList_Append(p->refs, Py_None) < 0)
goto err;
} else
*idx = 0;
return o;
err:
Py_XDECREF(o); /* release the new object */
*idx = -1;
return NULL;
}
/* insert actual object to the reflist */
static PyObject *
r_ref_insert(PyObject *o, Py_ssize_t idx, int flag, RFILE *p)
{
if (o && (flag & FLAG_REF)) {
if (PyList_SetItem(p->refs, idx, o) < 0) {
Py_DECREF(o); /* release the new object */
return NULL;
} else {
Py_INCREF(o); /* a reference for the list */
}
}
return o;
}
/* combination of both above, used when an object can be
* created whenever it is seen in the file, as opposed to
* after having loaded its sub-objects.
*/
static PyObject *
r_ref(PyObject *o, int flag, RFILE *p)
{
if (o && (flag & FLAG_REF)) {
if (PyList_Append(p->refs, o) < 0) {
Py_DECREF(o); /* release the new object */
return NULL;
}
}
return o;
}
static PyObject *
r_object(RFILE *p)
@ -666,8 +803,10 @@ r_object(RFILE *p)
/* NULL is a valid return value, it does not necessarily means that
an exception is set. */
PyObject *v, *v2;
Py_ssize_t idx;
long i, n;
int type = r_byte(p);
int flag;
PyObject *retval;
p->depth++;
@ -678,6 +817,13 @@ r_object(RFILE *p)
return NULL;
}
flag = type & FLAG_REF;
type = type & ~FLAG_REF;
#define R_REF(O) do{\
O = r_ref(O, flag, p);\
} while (0)
switch (type) {
case EOF:
@ -718,14 +864,17 @@ r_object(RFILE *p)
case TYPE_INT:
n = r_long(p);
retval = PyErr_Occurred() ? NULL : PyLong_FromLong(n);
R_REF(retval);
break;
case TYPE_INT64:
retval = r_long64(p);
R_REF(retval);
break;
case TYPE_LONG:
retval = r_PyLong(p);
R_REF(retval);
break;
case TYPE_FLOAT:
@ -744,6 +893,7 @@ r_object(RFILE *p)
if (dx == -1.0 && PyErr_Occurred())
break;
retval = PyFloat_FromDouble(dx);
R_REF(retval);
break;
}
@ -763,6 +913,7 @@ r_object(RFILE *p)
break;
}
retval = PyFloat_FromDouble(x);
R_REF(retval);
break;
}
@ -792,6 +943,7 @@ r_object(RFILE *p)
if (c.imag == -1.0 && PyErr_Occurred())
break;
retval = PyComplex_FromCComplex(c);
R_REF(retval);
break;
}
@ -822,6 +974,7 @@ r_object(RFILE *p)
break;
}
retval = PyComplex_FromCComplex(c);
R_REF(retval);
break;
}
@ -849,9 +1002,11 @@ r_object(RFILE *p)
break;
}
retval = v;
R_REF(retval);
break;
case TYPE_UNICODE:
case TYPE_INTERNED:
{
char *buffer;
@ -879,7 +1034,10 @@ r_object(RFILE *p)
}
v = PyUnicode_DecodeUTF8(buffer, n, "surrogatepass");
PyMem_DEL(buffer);
if (type == TYPE_INTERNED)
PyUnicode_InternInPlace(&v);
retval = v;
R_REF(retval);
break;
}
@ -895,6 +1053,7 @@ r_object(RFILE *p)
break;
}
v = PyTuple_New(n);
R_REF(v);
if (v == NULL) {
retval = NULL;
break;
@ -926,6 +1085,7 @@ r_object(RFILE *p)
break;
}
v = PyList_New(n);
R_REF(v);
if (v == NULL) {
retval = NULL;
break;
@ -947,6 +1107,7 @@ r_object(RFILE *p)
case TYPE_DICT:
v = PyDict_New();
R_REF(v);
if (v == NULL) {
retval = NULL;
break;
@ -982,6 +1143,13 @@ r_object(RFILE *p)
break;
}
v = (type == TYPE_SET) ? PySet_New(NULL) : PyFrozenSet_New(NULL);
/* must use delayed registration of frozensets because they must
* be init with a refcount of 1
*/
if (type == TYPE_SET)
R_REF(v);
else
v = r_ref_reserve(v, &idx, flag, p);
if (v == NULL) {
retval = NULL;
break;
@ -1004,6 +1172,8 @@ r_object(RFILE *p)
}
Py_DECREF(v2);
}
if (type != TYPE_SET)
v = r_ref_insert(v, idx, flag, p);
retval = v;
break;
@ -1024,6 +1194,12 @@ r_object(RFILE *p)
PyObject *name = NULL;
int firstlineno;
PyObject *lnotab = NULL;
r_ref_reserve(NULL, &idx, flag, p);
if (idx < 0) {
retval = NULL;
break;
}
v = NULL;
@ -1090,6 +1266,7 @@ r_object(RFILE *p)
code, consts, names, varnames,
freevars, cellvars, filename, name,
firstlineno, lnotab);
v = r_ref_insert(v, idx, flag, p);
code_error:
Py_XDECREF(code);
@ -1105,6 +1282,23 @@ r_object(RFILE *p)
retval = v;
break;
case TYPE_REF:
n = r_long(p);
if (n < 0 || n >= PyList_GET_SIZE(p->refs)) {
PyErr_SetString(PyExc_ValueError, "bad marshal data (invalid reference)");
retval = NULL;
break;
}
v = PyList_GET_ITEM(p->refs, n);
if (v == Py_None) {
PyErr_SetString(PyExc_ValueError, "bad marshal data (invalid reference)");
retval = NULL;
break;
}
Py_INCREF(v);
retval = v;
break;
default:
/* Bogus data got written, which isn't ideal.
This will let you keep working and recover. */
@ -1210,7 +1404,11 @@ PyMarshal_ReadObjectFromFile(FILE *fp)
rf.current_filename = NULL;
rf.depth = 0;
rf.ptr = rf.end = NULL;
rf.refs = PyList_New(0);
if (rf.refs == NULL)
return NULL;
result = r_object(&rf);
Py_DECREF(rf.refs);
return result;
}
@ -1225,7 +1423,11 @@ PyMarshal_ReadObjectFromString(char *str, Py_ssize_t len)
rf.ptr = str;
rf.end = str + len;
rf.depth = 0;
rf.refs = PyList_New(0);
if (rf.refs == NULL)
return NULL;
result = r_object(&rf);
Py_DECREF(rf.refs);
return result;
}
@ -1244,7 +1446,13 @@ PyMarshal_WriteObjectToString(PyObject *x, int version)
wf.error = WFERR_OK;
wf.depth = 0;
wf.version = version;
if (version >= 3) {
if ((wf.refs = PyDict_New()) == NULL)
return NULL;
} else
wf.refs = NULL;
w_object(x, &wf);
Py_XDECREF(wf.refs);
if (wf.str != NULL) {
char *base = PyBytes_AS_STRING((PyBytesObject *)wf.str);
if (wf.ptr - base > PY_SSIZE_T_MAX) {
@ -1316,6 +1524,8 @@ marshal_load(PyObject *self, PyObject *f)
* Make a call to the read method, but read zero bytes.
* This is to ensure that the object passed in at least
* has a read method which returns bytes.
* This can be removed if we guarantee good error handling
* for r_string()
*/
data = _PyObject_CallMethodId(f, &PyId_read, "i", 0);
if (data == NULL)
@ -1331,7 +1541,11 @@ marshal_load(PyObject *self, PyObject *f)
rf.fp = NULL;
rf.readable = f;
rf.current_filename = NULL;
result = read_object(&rf);
if ((rf.refs = PyList_New(0)) != NULL) {
result = read_object(&rf);
Py_DECREF(rf.refs);
} else
result = NULL;
}
Py_DECREF(data);
return result;
@ -1388,8 +1602,11 @@ marshal_loads(PyObject *self, PyObject *args)
rf.ptr = s;
rf.end = s + n;
rf.depth = 0;
if ((rf.refs = PyList_New(0)) == NULL)
return NULL;
result = read_object(&rf);
PyBuffer_Release(&p);
Py_DECREF(rf.refs);
return result;
}
@ -1429,6 +1646,7 @@ Variables:\n\
version -- indicates the format that the module uses. Version 0 is the\n\
historical format, version 1 shares interned strings and version 2\n\
uses a binary format for floating point numbers.\n\
Version 3 shares common object references (New in version 3.4).\n\
\n\
Functions:\n\
\n\