mirror of
https://github.com/python/cpython.git
synced 2025-09-02 15:07:53 +00:00
Moved reader \r and \n processing from the iterator to the state machine -
this allows for better handling of newline characters in quoted fields (and hopefully resolves Bug 967934).
This commit is contained in:
parent
a1974c1459
commit
f69d94f6c0
2 changed files with 78 additions and 99 deletions
|
@ -48,10 +48,11 @@ Library
|
||||||
dictates.
|
dictates.
|
||||||
+ the parser now removes the escapechar prefix from escaped characters.
|
+ the parser now removes the escapechar prefix from escaped characters.
|
||||||
+ when quoting=QUOTE_NONNUMERIC, the writer now tests for numeric
|
+ when quoting=QUOTE_NONNUMERIC, the writer now tests for numeric
|
||||||
objects, rather than attempting to cast to float, and using the
|
types, rather than any object that can be represented as a number.
|
||||||
success of that as the determinator.
|
|
||||||
+ when quoting=QUOTE_NONNUMERIC, the reader now casts unquoted fields
|
+ when quoting=QUOTE_NONNUMERIC, the reader now casts unquoted fields
|
||||||
to floats.
|
to floats.
|
||||||
|
+ reader now allows \r characters to be quoted (previously it only allowed
|
||||||
|
\n to be quoted).
|
||||||
+ writer doublequote handling improved.
|
+ writer doublequote handling improved.
|
||||||
+ Dialect classes passed to the module are no longer instantiated by
|
+ Dialect classes passed to the module are no longer instantiated by
|
||||||
the module before being parsed (the former validation scheme required
|
the module before being parsed (the former validation scheme required
|
||||||
|
|
172
Modules/_csv.c
172
Modules/_csv.c
|
@ -48,7 +48,8 @@ static long field_limit = 128 * 1024; /* max parsed field size */
|
||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD,
|
START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD,
|
||||||
IN_QUOTED_FIELD, ESCAPE_IN_QUOTED_FIELD, QUOTE_IN_QUOTED_FIELD
|
IN_QUOTED_FIELD, ESCAPE_IN_QUOTED_FIELD, QUOTE_IN_QUOTED_FIELD,
|
||||||
|
EAT_CRNL
|
||||||
} ParserState;
|
} ParserState;
|
||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
|
@ -96,7 +97,6 @@ typedef struct {
|
||||||
char *field; /* build current field in here */
|
char *field; /* build current field in here */
|
||||||
int field_size; /* size of allocated buffer */
|
int field_size; /* size of allocated buffer */
|
||||||
int field_len; /* length of current field */
|
int field_len; /* length of current field */
|
||||||
int had_parse_error; /* did we have a parse error? */
|
|
||||||
int numeric_field; /* treat field as numeric */
|
int numeric_field; /* treat field as numeric */
|
||||||
unsigned long line_num; /* Source-file line number */
|
unsigned long line_num; /* Source-file line number */
|
||||||
} ReaderObj;
|
} ReaderObj;
|
||||||
|
@ -497,6 +497,9 @@ _call_dialect(PyObject *dialect_inst, PyObject *kwargs)
|
||||||
return dialect;
|
return dialect;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* READER
|
||||||
|
*/
|
||||||
static int
|
static int
|
||||||
parse_save_field(ReaderObj *self)
|
parse_save_field(ReaderObj *self)
|
||||||
{
|
{
|
||||||
|
@ -543,22 +546,6 @@ parse_grow_buff(ReaderObj *self)
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
|
||||||
parse_reset(ReaderObj *self)
|
|
||||||
{
|
|
||||||
if (self->fields) {
|
|
||||||
Py_DECREF(self->fields);
|
|
||||||
}
|
|
||||||
self->fields = PyList_New(0);
|
|
||||||
if (self->fields == NULL)
|
|
||||||
return -1;
|
|
||||||
self->field_len = 0;
|
|
||||||
self->state = START_RECORD;
|
|
||||||
self->had_parse_error = 0;
|
|
||||||
self->numeric_field = 0;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int
|
static int
|
||||||
parse_add_char(ReaderObj *self, char c)
|
parse_add_char(ReaderObj *self, char c)
|
||||||
{
|
{
|
||||||
|
@ -581,19 +568,23 @@ parse_process_char(ReaderObj *self, char c)
|
||||||
switch (self->state) {
|
switch (self->state) {
|
||||||
case START_RECORD:
|
case START_RECORD:
|
||||||
/* start of record */
|
/* start of record */
|
||||||
if (c == '\n')
|
if (c == '\0')
|
||||||
/* empty line - return [] */
|
/* empty line - return [] */
|
||||||
break;
|
break;
|
||||||
|
else if (c == '\n' || c == '\r') {
|
||||||
|
self->state = EAT_CRNL;
|
||||||
|
break;
|
||||||
|
}
|
||||||
/* normal character - handle as START_FIELD */
|
/* normal character - handle as START_FIELD */
|
||||||
self->state = START_FIELD;
|
self->state = START_FIELD;
|
||||||
/* fallthru */
|
/* fallthru */
|
||||||
case START_FIELD:
|
case START_FIELD:
|
||||||
/* expecting field */
|
/* expecting field */
|
||||||
if (c == '\n') {
|
if (c == '\n' || c == '\r' || c == '\0') {
|
||||||
/* save empty field - return [fields] */
|
/* save empty field - return [fields] */
|
||||||
if (parse_save_field(self) < 0)
|
if (parse_save_field(self) < 0)
|
||||||
return -1;
|
return -1;
|
||||||
self->state = START_RECORD;
|
self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
|
||||||
}
|
}
|
||||||
else if (c == dialect->quotechar &&
|
else if (c == dialect->quotechar &&
|
||||||
dialect->quoting != QUOTE_NONE) {
|
dialect->quoting != QUOTE_NONE) {
|
||||||
|
@ -623,6 +614,8 @@ parse_process_char(ReaderObj *self, char c)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ESCAPED_CHAR:
|
case ESCAPED_CHAR:
|
||||||
|
if (c == '\0')
|
||||||
|
c = '\n';
|
||||||
if (parse_add_char(self, c) < 0)
|
if (parse_add_char(self, c) < 0)
|
||||||
return -1;
|
return -1;
|
||||||
self->state = IN_FIELD;
|
self->state = IN_FIELD;
|
||||||
|
@ -630,11 +623,11 @@ parse_process_char(ReaderObj *self, char c)
|
||||||
|
|
||||||
case IN_FIELD:
|
case IN_FIELD:
|
||||||
/* in unquoted field */
|
/* in unquoted field */
|
||||||
if (c == '\n') {
|
if (c == '\n' || c == '\r' || c == '\0') {
|
||||||
/* end of line - return [fields] */
|
/* end of line - return [fields] */
|
||||||
if (parse_save_field(self) < 0)
|
if (parse_save_field(self) < 0)
|
||||||
return -1;
|
return -1;
|
||||||
self->state = START_RECORD;
|
self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
|
||||||
}
|
}
|
||||||
else if (c == dialect->escapechar) {
|
else if (c == dialect->escapechar) {
|
||||||
/* possible escaped character */
|
/* possible escaped character */
|
||||||
|
@ -655,11 +648,8 @@ parse_process_char(ReaderObj *self, char c)
|
||||||
|
|
||||||
case IN_QUOTED_FIELD:
|
case IN_QUOTED_FIELD:
|
||||||
/* in quoted field */
|
/* in quoted field */
|
||||||
if (c == '\n') {
|
if (c == '\0')
|
||||||
/* end of line - save '\n' in field */
|
;
|
||||||
if (parse_add_char(self, '\n') < 0)
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
else if (c == dialect->escapechar) {
|
else if (c == dialect->escapechar) {
|
||||||
/* Possible escape character */
|
/* Possible escape character */
|
||||||
self->state = ESCAPE_IN_QUOTED_FIELD;
|
self->state = ESCAPE_IN_QUOTED_FIELD;
|
||||||
|
@ -683,6 +673,8 @@ parse_process_char(ReaderObj *self, char c)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ESCAPE_IN_QUOTED_FIELD:
|
case ESCAPE_IN_QUOTED_FIELD:
|
||||||
|
if (c == '\0')
|
||||||
|
c = '\n';
|
||||||
if (parse_add_char(self, c) < 0)
|
if (parse_add_char(self, c) < 0)
|
||||||
return -1;
|
return -1;
|
||||||
self->state = IN_QUOTED_FIELD;
|
self->state = IN_QUOTED_FIELD;
|
||||||
|
@ -703,11 +695,11 @@ parse_process_char(ReaderObj *self, char c)
|
||||||
return -1;
|
return -1;
|
||||||
self->state = START_FIELD;
|
self->state = START_FIELD;
|
||||||
}
|
}
|
||||||
else if (c == '\n') {
|
else if (c == '\n' || c == '\r' || c == '\0') {
|
||||||
/* end of line - return [fields] */
|
/* end of line - return [fields] */
|
||||||
if (parse_save_field(self) < 0)
|
if (parse_save_field(self) < 0)
|
||||||
return -1;
|
return -1;
|
||||||
self->state = START_RECORD;
|
self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
|
||||||
}
|
}
|
||||||
else if (!dialect->strict) {
|
else if (!dialect->strict) {
|
||||||
if (parse_add_char(self, c) < 0)
|
if (parse_add_char(self, c) < 0)
|
||||||
|
@ -716,7 +708,6 @@ parse_process_char(ReaderObj *self, char c)
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
/* illegal */
|
/* illegal */
|
||||||
self->had_parse_error = 1;
|
|
||||||
PyErr_Format(error_obj, "'%c' expected after '%c'",
|
PyErr_Format(error_obj, "'%c' expected after '%c'",
|
||||||
dialect->delimiter,
|
dialect->delimiter,
|
||||||
dialect->quotechar);
|
dialect->quotechar);
|
||||||
|
@ -724,104 +715,83 @@ parse_process_char(ReaderObj *self, char c)
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case EAT_CRNL:
|
||||||
|
if (c == '\n' || c == '\r')
|
||||||
|
;
|
||||||
|
else if (c == '\0')
|
||||||
|
self->state = START_RECORD;
|
||||||
|
else {
|
||||||
|
PyErr_Format(error_obj, "new-line character seen in unquoted field - do you need to open the file in universal-newline mode?");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
static int
|
||||||
* READER
|
parse_reset(ReaderObj *self)
|
||||||
*/
|
{
|
||||||
#define R_OFF(x) offsetof(ReaderObj, x)
|
Py_XDECREF(self->fields);
|
||||||
|
self->fields = PyList_New(0);
|
||||||
static struct PyMemberDef Reader_memberlist[] = {
|
if (self->fields == NULL)
|
||||||
{ "dialect", T_OBJECT, R_OFF(dialect), RO },
|
return -1;
|
||||||
{ "line_num", T_ULONG, R_OFF(line_num), RO },
|
self->field_len = 0;
|
||||||
{ NULL }
|
self->state = START_RECORD;
|
||||||
};
|
self->numeric_field = 0;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
Reader_iternext(ReaderObj *self)
|
Reader_iternext(ReaderObj *self)
|
||||||
{
|
{
|
||||||
PyObject *lineobj;
|
PyObject *lineobj;
|
||||||
PyObject *fields;
|
PyObject *fields = NULL;
|
||||||
char *line;
|
char *line, c;
|
||||||
|
int linelen;
|
||||||
|
|
||||||
|
if (parse_reset(self) < 0)
|
||||||
|
return NULL;
|
||||||
do {
|
do {
|
||||||
lineobj = PyIter_Next(self->input_iter);
|
lineobj = PyIter_Next(self->input_iter);
|
||||||
if (lineobj == NULL) {
|
if (lineobj == NULL) {
|
||||||
/* End of input OR exception */
|
/* End of input OR exception */
|
||||||
if (!PyErr_Occurred() && self->field_len != 0)
|
if (!PyErr_Occurred() && self->field_len != 0)
|
||||||
return PyErr_Format(error_obj,
|
PyErr_Format(error_obj,
|
||||||
"newline inside string");
|
"newline inside string");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
++self->line_num;
|
++self->line_num;
|
||||||
|
|
||||||
if (self->had_parse_error)
|
|
||||||
if (parse_reset(self) < 0) {
|
|
||||||
Py_DECREF(lineobj);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
line = PyString_AsString(lineobj);
|
line = PyString_AsString(lineobj);
|
||||||
|
linelen = PyString_Size(lineobj);
|
||||||
|
|
||||||
if (line == NULL) {
|
if (line == NULL || linelen < 0) {
|
||||||
Py_DECREF(lineobj);
|
Py_DECREF(lineobj);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
if (strlen(line) < (size_t)PyString_GET_SIZE(lineobj)) {
|
while (linelen--) {
|
||||||
self->had_parse_error = 1;
|
c = *line++;
|
||||||
Py_DECREF(lineobj);
|
if (c == '\0') {
|
||||||
return PyErr_Format(error_obj,
|
Py_DECREF(lineobj);
|
||||||
"string with NUL bytes");
|
PyErr_Format(error_obj,
|
||||||
}
|
"line contains NULL byte");
|
||||||
|
goto err;
|
||||||
/* Process line of text - send '\n' to processing code to
|
}
|
||||||
represent end of line. End of line which is not at end of
|
|
||||||
string is an error. */
|
|
||||||
while (*line) {
|
|
||||||
char c;
|
|
||||||
|
|
||||||
c = *line++;
|
|
||||||
if (c == '\r') {
|
|
||||||
c = *line++;
|
|
||||||
if (c == '\0')
|
|
||||||
/* macintosh end of line */
|
|
||||||
break;
|
|
||||||
if (c == '\n') {
|
|
||||||
c = *line++;
|
|
||||||
if (c == '\0')
|
|
||||||
/* DOS end of line */
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
self->had_parse_error = 1;
|
|
||||||
Py_DECREF(lineobj);
|
|
||||||
return PyErr_Format(error_obj,
|
|
||||||
"newline inside string");
|
|
||||||
}
|
|
||||||
if (c == '\n') {
|
|
||||||
c = *line++;
|
|
||||||
if (c == '\0')
|
|
||||||
/* unix end of line */
|
|
||||||
break;
|
|
||||||
self->had_parse_error = 1;
|
|
||||||
Py_DECREF(lineobj);
|
|
||||||
return PyErr_Format(error_obj,
|
|
||||||
"newline inside string");
|
|
||||||
}
|
|
||||||
if (parse_process_char(self, c) < 0) {
|
if (parse_process_char(self, c) < 0) {
|
||||||
Py_DECREF(lineobj);
|
Py_DECREF(lineobj);
|
||||||
return NULL;
|
goto err;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (parse_process_char(self, '\n') < 0) {
|
|
||||||
Py_DECREF(lineobj);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
Py_DECREF(lineobj);
|
Py_DECREF(lineobj);
|
||||||
|
if (parse_process_char(self, 0) < 0)
|
||||||
|
goto err;
|
||||||
} while (self->state != START_RECORD);
|
} while (self->state != START_RECORD);
|
||||||
|
|
||||||
fields = self->fields;
|
fields = self->fields;
|
||||||
self->fields = PyList_New(0);
|
self->fields = NULL;
|
||||||
|
err:
|
||||||
return fields;
|
return fields;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -875,6 +845,14 @@ PyDoc_STRVAR(Reader_Type_doc,
|
||||||
static struct PyMethodDef Reader_methods[] = {
|
static struct PyMethodDef Reader_methods[] = {
|
||||||
{ NULL, NULL }
|
{ NULL, NULL }
|
||||||
};
|
};
|
||||||
|
#define R_OFF(x) offsetof(ReaderObj, x)
|
||||||
|
|
||||||
|
static struct PyMemberDef Reader_memberlist[] = {
|
||||||
|
{ "dialect", T_OBJECT, R_OFF(dialect), RO },
|
||||||
|
{ "line_num", T_ULONG, R_OFF(line_num), RO },
|
||||||
|
{ NULL }
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
static PyTypeObject Reader_Type = {
|
static PyTypeObject Reader_Type = {
|
||||||
PyObject_HEAD_INIT(NULL)
|
PyObject_HEAD_INIT(NULL)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue