mirror of
https://github.com/python/cpython.git
synced 2025-10-10 00:43:41 +00:00
Set an upper limit on the size of the field buffer, and raise an exception
when this limit is reached. The limit defaults to 128k and can be changed with the module-level set_field_limit() function. Previously, an unmatched quote character could result in the entire file being read into the field buffer, potentially exhausting virtual memory.
This commit is contained in:
parent
29bf4e44f6
commit
e4d05c4f93
3 changed files with 87 additions and 32 deletions
|
@ -6,6 +6,7 @@ csv.py - read/write/investigate CSV files
|
||||||
import re
|
import re
|
||||||
from _csv import Error, __version__, writer, reader, register_dialect, \
|
from _csv import Error, __version__, writer, reader, register_dialect, \
|
||||||
unregister_dialect, get_dialect, list_dialects, \
|
unregister_dialect, get_dialect, list_dialects, \
|
||||||
|
set_field_limit, \
|
||||||
QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
|
QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
|
||||||
__doc__
|
__doc__
|
||||||
from _csv import Dialect as _Dialect
|
from _csv import Dialect as _Dialect
|
||||||
|
|
|
@ -229,10 +229,17 @@ class Test_Csv(unittest.TestCase):
|
||||||
quoting=csv.QUOTE_NONE, escapechar='\\')
|
quoting=csv.QUOTE_NONE, escapechar='\\')
|
||||||
|
|
||||||
def test_read_bigfield(self):
|
def test_read_bigfield(self):
|
||||||
# This exercises the buffer realloc functionality
|
# This exercises the buffer realloc functionality and field size
|
||||||
bigstring = 'X' * 50000
|
# limits.
|
||||||
|
size = 50000
|
||||||
|
bigstring = 'X' * size
|
||||||
bigline = '%s,%s' % (bigstring, bigstring)
|
bigline = '%s,%s' % (bigstring, bigstring)
|
||||||
self._read_test([bigline], [[bigstring, bigstring]])
|
self._read_test([bigline], [[bigstring, bigstring]])
|
||||||
|
csv.set_field_limit(size)
|
||||||
|
self._read_test([bigline], [[bigstring, bigstring]])
|
||||||
|
self.assertEqual(csv.set_field_limit(), size)
|
||||||
|
csv.set_field_limit(size-1)
|
||||||
|
self.assertRaises(csv.Error, self._read_test, [bigline], [])
|
||||||
|
|
||||||
class TestDialectRegistry(unittest.TestCase):
|
class TestDialectRegistry(unittest.TestCase):
|
||||||
def test_registry_badargs(self):
|
def test_registry_badargs(self):
|
||||||
|
|
107
Modules/_csv.c
107
Modules/_csv.c
|
@ -44,6 +44,7 @@ module instead.
|
||||||
|
|
||||||
static PyObject *error_obj; /* CSV exception */
|
static PyObject *error_obj; /* CSV exception */
|
||||||
static PyObject *dialects; /* Dialect registry */
|
static PyObject *dialects; /* Dialect registry */
|
||||||
|
static long field_limit = 128 * 1024; /* max parsed field size */
|
||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD,
|
START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD,
|
||||||
|
@ -527,15 +528,21 @@ parse_grow_buff(ReaderObj *self)
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static int
|
||||||
parse_add_char(ReaderObj *self, char c)
|
parse_add_char(ReaderObj *self, char c)
|
||||||
{
|
{
|
||||||
|
if (self->field_len >= field_limit) {
|
||||||
|
PyErr_Format(error_obj, "field larger than field limit (%ld)",
|
||||||
|
field_limit);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
if (self->field_len == self->field_size && !parse_grow_buff(self))
|
if (self->field_len == self->field_size && !parse_grow_buff(self))
|
||||||
return;
|
return -1;
|
||||||
self->field[self->field_len++] = c;
|
self->field[self->field_len++] = c;
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static int
|
||||||
parse_process_char(ReaderObj *self, char c)
|
parse_process_char(ReaderObj *self, char c)
|
||||||
{
|
{
|
||||||
DialectObj *dialect = self->dialect;
|
DialectObj *dialect = self->dialect;
|
||||||
|
@ -574,13 +581,15 @@ parse_process_char(ReaderObj *self, char c)
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
/* begin new unquoted field */
|
/* begin new unquoted field */
|
||||||
parse_add_char(self, c);
|
if (parse_add_char(self, c) < 0)
|
||||||
|
return -1;
|
||||||
self->state = IN_FIELD;
|
self->state = IN_FIELD;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ESCAPED_CHAR:
|
case ESCAPED_CHAR:
|
||||||
parse_add_char(self, c);
|
if (parse_add_char(self, c) < 0)
|
||||||
|
return -1;
|
||||||
self->state = IN_FIELD;
|
self->state = IN_FIELD;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -602,7 +611,8 @@ parse_process_char(ReaderObj *self, char c)
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
/* normal character - save in field */
|
/* normal character - save in field */
|
||||||
parse_add_char(self, c);
|
if (parse_add_char(self, c) < 0)
|
||||||
|
return -1;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -610,7 +620,8 @@ parse_process_char(ReaderObj *self, char c)
|
||||||
/* in quoted field */
|
/* in quoted field */
|
||||||
if (c == '\n') {
|
if (c == '\n') {
|
||||||
/* end of line - save '\n' in field */
|
/* end of line - save '\n' in field */
|
||||||
parse_add_char(self, '\n');
|
if (parse_add_char(self, '\n') < 0)
|
||||||
|
return -1;
|
||||||
}
|
}
|
||||||
else if (c == dialect->escapechar) {
|
else if (c == dialect->escapechar) {
|
||||||
/* Possible escape character */
|
/* Possible escape character */
|
||||||
|
@ -629,12 +640,14 @@ parse_process_char(ReaderObj *self, char c)
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
/* normal character - save in field */
|
/* normal character - save in field */
|
||||||
parse_add_char(self, c);
|
if (parse_add_char(self, c) < 0)
|
||||||
|
return -1;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ESCAPE_IN_QUOTED_FIELD:
|
case ESCAPE_IN_QUOTED_FIELD:
|
||||||
parse_add_char(self, c);
|
if (parse_add_char(self, c) < 0)
|
||||||
|
return -1;
|
||||||
self->state = IN_QUOTED_FIELD;
|
self->state = IN_QUOTED_FIELD;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -643,7 +656,8 @@ parse_process_char(ReaderObj *self, char c)
|
||||||
if (dialect->quoting != QUOTE_NONE &&
|
if (dialect->quoting != QUOTE_NONE &&
|
||||||
c == dialect->quotechar) {
|
c == dialect->quotechar) {
|
||||||
/* save "" as " */
|
/* save "" as " */
|
||||||
parse_add_char(self, c);
|
if (parse_add_char(self, c) < 0)
|
||||||
|
return -1;
|
||||||
self->state = IN_QUOTED_FIELD;
|
self->state = IN_QUOTED_FIELD;
|
||||||
}
|
}
|
||||||
else if (c == dialect->delimiter) {
|
else if (c == dialect->delimiter) {
|
||||||
|
@ -657,7 +671,8 @@ parse_process_char(ReaderObj *self, char c)
|
||||||
self->state = START_RECORD;
|
self->state = START_RECORD;
|
||||||
}
|
}
|
||||||
else if (!dialect->strict) {
|
else if (!dialect->strict) {
|
||||||
parse_add_char(self, c);
|
if (parse_add_char(self, c) < 0)
|
||||||
|
return -1;
|
||||||
self->state = IN_FIELD;
|
self->state = IN_FIELD;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
@ -666,10 +681,12 @@ parse_process_char(ReaderObj *self, char c)
|
||||||
PyErr_Format(error_obj, "%c expected after %c",
|
PyErr_Format(error_obj, "%c expected after %c",
|
||||||
dialect->delimiter,
|
dialect->delimiter,
|
||||||
dialect->quotechar);
|
dialect->quotechar);
|
||||||
|
return -1;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -754,13 +771,15 @@ Reader_iternext(ReaderObj *self)
|
||||||
return PyErr_Format(error_obj,
|
return PyErr_Format(error_obj,
|
||||||
"newline inside string");
|
"newline inside string");
|
||||||
}
|
}
|
||||||
parse_process_char(self, c);
|
if (parse_process_char(self, c) < 0) {
|
||||||
if (PyErr_Occurred()) {
|
Py_DECREF(lineobj);
|
||||||
Py_DECREF(lineobj);
|
return NULL;
|
||||||
return NULL;
|
}
|
||||||
}
|
}
|
||||||
}
|
if (parse_process_char(self, '\n') < 0) {
|
||||||
parse_process_char(self, '\n');
|
Py_DECREF(lineobj);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
Py_DECREF(lineobj);
|
Py_DECREF(lineobj);
|
||||||
} while (self->state != START_RECORD);
|
} while (self->state != START_RECORD);
|
||||||
|
|
||||||
|
@ -1387,6 +1406,25 @@ csv_get_dialect(PyObject *module, PyObject *name_obj)
|
||||||
return get_dialect_from_registry(name_obj);
|
return get_dialect_from_registry(name_obj);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
csv_set_field_limit(PyObject *module, PyObject *args)
|
||||||
|
{
|
||||||
|
PyObject *new_limit = NULL;
|
||||||
|
long old_limit = field_limit;
|
||||||
|
|
||||||
|
if (!PyArg_UnpackTuple(args, "set_field_limit", 0, 1, &new_limit))
|
||||||
|
return NULL;
|
||||||
|
if (new_limit != NULL) {
|
||||||
|
if (!PyInt_Check(new_limit)) {
|
||||||
|
PyErr_Format(PyExc_TypeError,
|
||||||
|
"limit must be an integer");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
field_limit = PyInt_AsLong(new_limit);
|
||||||
|
}
|
||||||
|
return PyInt_FromLong(old_limit);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* MODULE
|
* MODULE
|
||||||
*/
|
*/
|
||||||
|
@ -1494,20 +1532,29 @@ PyDoc_STRVAR(csv_unregister_dialect_doc,
|
||||||
"Delete the name/dialect mapping associated with a string name.\n"
|
"Delete the name/dialect mapping associated with a string name.\n"
|
||||||
" csv.unregister_dialect(name)");
|
" csv.unregister_dialect(name)");
|
||||||
|
|
||||||
|
PyDoc_STRVAR(csv_set_field_limit_doc,
|
||||||
|
"Sets an upper limit on parsed fields.\n"
|
||||||
|
" csv.set_field_limit([limit])\n"
|
||||||
|
"\n"
|
||||||
|
"Returns old limit. If limit is not given, no new limit is set and\n"
|
||||||
|
"the old limit is returned");
|
||||||
|
|
||||||
static struct PyMethodDef csv_methods[] = {
|
static struct PyMethodDef csv_methods[] = {
|
||||||
{ "reader", (PyCFunction)csv_reader,
|
{ "reader", (PyCFunction)csv_reader,
|
||||||
METH_VARARGS | METH_KEYWORDS, csv_reader_doc},
|
METH_VARARGS | METH_KEYWORDS, csv_reader_doc},
|
||||||
{ "writer", (PyCFunction)csv_writer,
|
{ "writer", (PyCFunction)csv_writer,
|
||||||
METH_VARARGS | METH_KEYWORDS, csv_writer_doc},
|
METH_VARARGS | METH_KEYWORDS, csv_writer_doc},
|
||||||
{ "list_dialects", (PyCFunction)csv_list_dialects,
|
{ "list_dialects", (PyCFunction)csv_list_dialects,
|
||||||
METH_NOARGS, csv_list_dialects_doc},
|
METH_NOARGS, csv_list_dialects_doc},
|
||||||
{ "register_dialect", (PyCFunction)csv_register_dialect,
|
{ "register_dialect", (PyCFunction)csv_register_dialect,
|
||||||
METH_VARARGS | METH_KEYWORDS, csv_register_dialect_doc},
|
METH_VARARGS | METH_KEYWORDS, csv_register_dialect_doc},
|
||||||
{ "unregister_dialect", (PyCFunction)csv_unregister_dialect,
|
{ "unregister_dialect", (PyCFunction)csv_unregister_dialect,
|
||||||
METH_O, csv_unregister_dialect_doc},
|
METH_O, csv_unregister_dialect_doc},
|
||||||
{ "get_dialect", (PyCFunction)csv_get_dialect,
|
{ "get_dialect", (PyCFunction)csv_get_dialect,
|
||||||
METH_O, csv_get_dialect_doc},
|
METH_O, csv_get_dialect_doc},
|
||||||
{ NULL, NULL }
|
{ "set_field_limit", (PyCFunction)csv_set_field_limit,
|
||||||
|
METH_VARARGS, csv_set_field_limit_doc},
|
||||||
|
{ NULL, NULL }
|
||||||
};
|
};
|
||||||
|
|
||||||
PyMODINIT_FUNC
|
PyMODINIT_FUNC
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue