Set an upper limit on the size of the field buffer, raise an exception

when this limit is reached. The limit defaults to 128k and can be changed
with the module-level set_field_limit() function. Previously, an unmatched quote
character could result in the entire file being read into the field
buffer, potentially exhausting virtual memory.
This commit is contained in:
Andrew McNamara 2005-01-11 07:32:02 +00:00
parent 29bf4e44f6
commit e4d05c4f93
3 changed files with 87 additions and 32 deletions

View file

@ -6,6 +6,7 @@ csv.py - read/write/investigate CSV files
import re import re
from _csv import Error, __version__, writer, reader, register_dialect, \ from _csv import Error, __version__, writer, reader, register_dialect, \
unregister_dialect, get_dialect, list_dialects, \ unregister_dialect, get_dialect, list_dialects, \
set_field_limit, \
QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \ QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
__doc__ __doc__
from _csv import Dialect as _Dialect from _csv import Dialect as _Dialect

View file

@ -229,10 +229,17 @@ class Test_Csv(unittest.TestCase):
quoting=csv.QUOTE_NONE, escapechar='\\') quoting=csv.QUOTE_NONE, escapechar='\\')
def test_read_bigfield(self): def test_read_bigfield(self):
# This exercises the buffer realloc functionality # This exercises the buffer realloc functionality and field size
bigstring = 'X' * 50000 # limits.
size = 50000
bigstring = 'X' * size
bigline = '%s,%s' % (bigstring, bigstring) bigline = '%s,%s' % (bigstring, bigstring)
self._read_test([bigline], [[bigstring, bigstring]]) self._read_test([bigline], [[bigstring, bigstring]])
csv.set_field_limit(size)
self._read_test([bigline], [[bigstring, bigstring]])
self.assertEqual(csv.set_field_limit(), size)
csv.set_field_limit(size-1)
self.assertRaises(csv.Error, self._read_test, [bigline], [])
class TestDialectRegistry(unittest.TestCase): class TestDialectRegistry(unittest.TestCase):
def test_registry_badargs(self): def test_registry_badargs(self):

View file

@ -44,6 +44,7 @@ module instead.
static PyObject *error_obj; /* CSV exception */ static PyObject *error_obj; /* CSV exception */
static PyObject *dialects; /* Dialect registry */ static PyObject *dialects; /* Dialect registry */
static long field_limit = 128 * 1024; /* max parsed field size */
typedef enum { typedef enum {
START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD, START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD,
@ -527,15 +528,21 @@ parse_grow_buff(ReaderObj *self)
return 1; return 1;
} }
static void static int
parse_add_char(ReaderObj *self, char c) parse_add_char(ReaderObj *self, char c)
{ {
if (self->field_len >= field_limit) {
PyErr_Format(error_obj, "field larger than field limit (%ld)",
field_limit);
return -1;
}
if (self->field_len == self->field_size && !parse_grow_buff(self)) if (self->field_len == self->field_size && !parse_grow_buff(self))
return; return -1;
self->field[self->field_len++] = c; self->field[self->field_len++] = c;
return 0;
} }
static void static int
parse_process_char(ReaderObj *self, char c) parse_process_char(ReaderObj *self, char c)
{ {
DialectObj *dialect = self->dialect; DialectObj *dialect = self->dialect;
@ -574,13 +581,15 @@ parse_process_char(ReaderObj *self, char c)
} }
else { else {
/* begin new unquoted field */ /* begin new unquoted field */
parse_add_char(self, c); if (parse_add_char(self, c) < 0)
return -1;
self->state = IN_FIELD; self->state = IN_FIELD;
} }
break; break;
case ESCAPED_CHAR: case ESCAPED_CHAR:
parse_add_char(self, c); if (parse_add_char(self, c) < 0)
return -1;
self->state = IN_FIELD; self->state = IN_FIELD;
break; break;
@ -602,7 +611,8 @@ parse_process_char(ReaderObj *self, char c)
} }
else { else {
/* normal character - save in field */ /* normal character - save in field */
parse_add_char(self, c); if (parse_add_char(self, c) < 0)
return -1;
} }
break; break;
@ -610,7 +620,8 @@ parse_process_char(ReaderObj *self, char c)
/* in quoted field */ /* in quoted field */
if (c == '\n') { if (c == '\n') {
/* end of line - save '\n' in field */ /* end of line - save '\n' in field */
parse_add_char(self, '\n'); if (parse_add_char(self, '\n') < 0)
return -1;
} }
else if (c == dialect->escapechar) { else if (c == dialect->escapechar) {
/* Possible escape character */ /* Possible escape character */
@ -629,12 +640,14 @@ parse_process_char(ReaderObj *self, char c)
} }
else { else {
/* normal character - save in field */ /* normal character - save in field */
parse_add_char(self, c); if (parse_add_char(self, c) < 0)
return -1;
} }
break; break;
case ESCAPE_IN_QUOTED_FIELD: case ESCAPE_IN_QUOTED_FIELD:
parse_add_char(self, c); if (parse_add_char(self, c) < 0)
return -1;
self->state = IN_QUOTED_FIELD; self->state = IN_QUOTED_FIELD;
break; break;
@ -643,7 +656,8 @@ parse_process_char(ReaderObj *self, char c)
if (dialect->quoting != QUOTE_NONE && if (dialect->quoting != QUOTE_NONE &&
c == dialect->quotechar) { c == dialect->quotechar) {
/* save "" as " */ /* save "" as " */
parse_add_char(self, c); if (parse_add_char(self, c) < 0)
return -1;
self->state = IN_QUOTED_FIELD; self->state = IN_QUOTED_FIELD;
} }
else if (c == dialect->delimiter) { else if (c == dialect->delimiter) {
@ -657,7 +671,8 @@ parse_process_char(ReaderObj *self, char c)
self->state = START_RECORD; self->state = START_RECORD;
} }
else if (!dialect->strict) { else if (!dialect->strict) {
parse_add_char(self, c); if (parse_add_char(self, c) < 0)
return -1;
self->state = IN_FIELD; self->state = IN_FIELD;
} }
else { else {
@ -666,10 +681,12 @@ parse_process_char(ReaderObj *self, char c)
PyErr_Format(error_obj, "%c expected after %c", PyErr_Format(error_obj, "%c expected after %c",
dialect->delimiter, dialect->delimiter,
dialect->quotechar); dialect->quotechar);
return -1;
} }
break; break;
} }
return 0;
} }
/* /*
@ -754,13 +771,15 @@ Reader_iternext(ReaderObj *self)
return PyErr_Format(error_obj, return PyErr_Format(error_obj,
"newline inside string"); "newline inside string");
} }
parse_process_char(self, c); if (parse_process_char(self, c) < 0) {
if (PyErr_Occurred()) { Py_DECREF(lineobj);
Py_DECREF(lineobj); return NULL;
return NULL; }
} }
} if (parse_process_char(self, '\n') < 0) {
parse_process_char(self, '\n'); Py_DECREF(lineobj);
return NULL;
}
Py_DECREF(lineobj); Py_DECREF(lineobj);
} while (self->state != START_RECORD); } while (self->state != START_RECORD);
@ -1387,6 +1406,25 @@ csv_get_dialect(PyObject *module, PyObject *name_obj)
return get_dialect_from_registry(name_obj); return get_dialect_from_registry(name_obj);
} }
/*
 * csv.set_field_limit([limit])
 *
 * Return the current value of the file-level field_limit.  When the
 * optional argument is supplied it must be an integer, and it becomes
 * the new field_limit; the value returned is still the limit that was
 * in effect before the call.  A non-integer argument raises TypeError
 * and leaves field_limit unchanged.
 */
static PyObject *
csv_set_field_limit(PyObject *module, PyObject *args)
{
    PyObject *limit_arg = NULL;
    long previous;

    if (!PyArg_UnpackTuple(args, "set_field_limit", 0, 1, &limit_arg))
        return NULL;

    /* Capture the limit in force before any update is applied. */
    previous = field_limit;

    /* No argument given: pure query, nothing to change. */
    if (limit_arg == NULL)
        return PyInt_FromLong(previous);

    if (!PyInt_Check(limit_arg)) {
        PyErr_Format(PyExc_TypeError,
                     "limit must be an integer");
        return NULL;
    }

    field_limit = PyInt_AsLong(limit_arg);
    return PyInt_FromLong(previous);
}
/* /*
* MODULE * MODULE
*/ */
@ -1494,20 +1532,29 @@ PyDoc_STRVAR(csv_unregister_dialect_doc,
"Delete the name/dialect mapping associated with a string name.\n" "Delete the name/dialect mapping associated with a string name.\n"
" csv.unregister_dialect(name)"); " csv.unregister_dialect(name)");
/* Docstring for the "set_field_limit" entry in the csv_methods table. */
PyDoc_STRVAR(csv_set_field_limit_doc,
"Sets an upper limit on parsed fields.\n"
" csv.set_field_limit([limit])\n"
"\n"
"Returns old limit. If limit is not given, no new limit is set and\n"
"the old limit is returned");
static struct PyMethodDef csv_methods[] = { static struct PyMethodDef csv_methods[] = {
{ "reader", (PyCFunction)csv_reader, { "reader", (PyCFunction)csv_reader,
METH_VARARGS | METH_KEYWORDS, csv_reader_doc}, METH_VARARGS | METH_KEYWORDS, csv_reader_doc},
{ "writer", (PyCFunction)csv_writer, { "writer", (PyCFunction)csv_writer,
METH_VARARGS | METH_KEYWORDS, csv_writer_doc}, METH_VARARGS | METH_KEYWORDS, csv_writer_doc},
{ "list_dialects", (PyCFunction)csv_list_dialects, { "list_dialects", (PyCFunction)csv_list_dialects,
METH_NOARGS, csv_list_dialects_doc}, METH_NOARGS, csv_list_dialects_doc},
{ "register_dialect", (PyCFunction)csv_register_dialect, { "register_dialect", (PyCFunction)csv_register_dialect,
METH_VARARGS | METH_KEYWORDS, csv_register_dialect_doc}, METH_VARARGS | METH_KEYWORDS, csv_register_dialect_doc},
{ "unregister_dialect", (PyCFunction)csv_unregister_dialect, { "unregister_dialect", (PyCFunction)csv_unregister_dialect,
METH_O, csv_unregister_dialect_doc}, METH_O, csv_unregister_dialect_doc},
{ "get_dialect", (PyCFunction)csv_get_dialect, { "get_dialect", (PyCFunction)csv_get_dialect,
METH_O, csv_get_dialect_doc}, METH_O, csv_get_dialect_doc},
{ NULL, NULL } { "set_field_limit", (PyCFunction)csv_set_field_limit,
METH_VARARGS, csv_set_field_limit_doc},
{ NULL, NULL }
}; };
PyMODINIT_FUNC PyMODINIT_FUNC