Moved reader \r and \n processing from the iterator to the state machine -

this allows for better handling of newline characters in quoted fields (and hopefully resolves Bug 967934).
2025-09-02 15:07:53 +00:00 · 2005-01-13 11:30:54 +00:00 · 2005-01-13 11:30:54 +00:00 · f69d94f6c0
commit f69d94f6c0
parent a1974c1459
2 changed files with 78 additions and 99 deletions
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -48,10 +48,11 @@ Library
    dictates.
  + the parser now removes the escapechar prefix from escaped characters.
  + when quoting=QUOTE_NONNUMERIC, the writer now tests for numeric
-    objects, rather than attempting to cast to float, and using the
+    types, rather than any object that can be represented as a numeric.
    success of that as the determinator.
  + when quoting=QUOTE_NONNUMERIC, the reader now casts unquoted fields
    to floats.
  + reader now allows \r characters to be quoted (previously it only allowed
    \n to be quoted).
  + writer doublequote handling improved.
  + Dialect classes passed to the module are no longer instantiated by
    the module before being parsed (the former validation scheme required
--- a/Modules/_csv.c
+++ b/Modules/_csv.c
@ -48,7 +48,8 @@ static long field_limit = 128 * 1024;	/* max parsed field size */
 typedef enum {
 	START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD, 
-	IN_QUOTED_FIELD, ESCAPE_IN_QUOTED_FIELD, QUOTE_IN_QUOTED_FIELD
+	IN_QUOTED_FIELD, ESCAPE_IN_QUOTED_FIELD, QUOTE_IN_QUOTED_FIELD,
 	EAT_CRNL
 } ParserState;
 typedef enum {
@ -96,7 +97,6 @@ typedef struct {
 	char *field;		/* build current field in here */
 	int field_size;		/* size of allocated buffer */
 	int field_len;		/* length of current field */
 	int had_parse_error;	/* did we have a parse error? */
 	int numeric_field;	/* treat field as numeric */
 	unsigned long line_num;	/* Source-file line number */
 } ReaderObj;
@ -497,6 +497,9 @@ _call_dialect(PyObject *dialect_inst, PyObject *kwargs)
 	return dialect;
 }
 /*
 * READER
 */
 static int
 parse_save_field(ReaderObj *self)
 {
@ -543,22 +546,6 @@ parse_grow_buff(ReaderObj *self)
 	return 1;
 }
 /*
  * Reset the reader's per-record parse state so a new record can be parsed.
  *
  * Drops the reference to any existing self->fields list, replaces it with
  * a new empty list, and clears the field length, state machine state,
  * parse-error flag and numeric-field flag.
  *
  * Returns 0 on success, or -1 if the new list could not be created
  * (self->fields is left NULL in that case).
  */
 static int
 parse_reset(ReaderObj *self)
 {
 	/* Release the previous record's list, if there is one. */
 	if (self->fields) {
 		Py_DECREF(self->fields);
 	}
 	self->fields = PyList_New(0);
 	if (self->fields == NULL)
 		return -1;
 	/* Discard any partially accumulated field and restart the
 	   state machine at the beginning of a record. */
 	self->field_len = 0;
 	self->state = START_RECORD;
 	self->had_parse_error = 0;
 	self->numeric_field = 0;
 	return 0;
 }
 static int
 parse_add_char(ReaderObj *self, char c)
 {
@ -581,19 +568,23 @@ parse_process_char(ReaderObj *self, char c)
 	switch (self->state) {
 	case START_RECORD:
 		/* start of record */
-		if (c == '\n')
+		if (c == '\0')
 			/* empty line - return [] */
 			break;
 		else if (c == '\n' || c == '\r') {
 			self->state = EAT_CRNL;
 			break;
 		}
 		/* normal character - handle as START_FIELD */
 		self->state = START_FIELD;
 		/* fallthru */
 	case START_FIELD:
 		/* expecting field */
-		if (c == '\n') {
+		if (c == '\n' || c == '\r' || c == '\0') {
 			/* save empty field - return [fields] */
 			if (parse_save_field(self) < 0)
 				return -1;
-			self->state = START_RECORD;
+			self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
 		}
 		else if (c == dialect->quotechar && 
 			 dialect->quoting != QUOTE_NONE) {
@ -623,6 +614,8 @@ parse_process_char(ReaderObj *self, char c)
 		break;
 	case ESCAPED_CHAR:
 		if (c == '\0')
 			c = '\n';
 		if (parse_add_char(self, c) < 0)
 			return -1;
 		self->state = IN_FIELD;
@ -630,11 +623,11 @@ parse_process_char(ReaderObj *self, char c)
 	case IN_FIELD:
 		/* in unquoted field */
-		if (c == '\n') {
+		if (c == '\n' || c == '\r' || c == '\0') {
 			/* end of line - return [fields] */
 			if (parse_save_field(self) < 0)
 				return -1;
-			self->state = START_RECORD;
+			self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
 		}
 		else if (c == dialect->escapechar) {
 			/* possible escaped character */
@ -655,11 +648,8 @@ parse_process_char(ReaderObj *self, char c)
 	case IN_QUOTED_FIELD:
 		/* in quoted field */
-		if (c == '\n') {
+		if (c == '\0')
-			/* end of line - save '\n' in field */
+			;
 			if (parse_add_char(self, '\n') < 0)
 				return -1;
 		}
 		else if (c == dialect->escapechar) {
 			/* Possible escape character */
 			self->state = ESCAPE_IN_QUOTED_FIELD;
@ -683,6 +673,8 @@ parse_process_char(ReaderObj *self, char c)
 		break;
 	case ESCAPE_IN_QUOTED_FIELD:
 		if (c == '\0')
 			c = '\n';
 		if (parse_add_char(self, c) < 0)
 			return -1;
 		self->state = IN_QUOTED_FIELD;
@ -703,11 +695,11 @@ parse_process_char(ReaderObj *self, char c)
 				return -1;
 			self->state = START_FIELD;
 		}
-		else if (c == '\n') {
+		else if (c == '\n' || c == '\r' || c == '\0') {
 			/* end of line - return [fields] */
 			if (parse_save_field(self) < 0)
 				return -1;
-			self->state = START_RECORD;
+			self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
 		}
 		else if (!dialect->strict) {
 			if (parse_add_char(self, c) < 0)
@ -716,7 +708,6 @@ parse_process_char(ReaderObj *self, char c)
 		}
 		else {
 			/* illegal */
 			self->had_parse_error = 1;
 			PyErr_Format(error_obj, "'%c' expected after '%c'", 
 					dialect->delimiter, 
                                        dialect->quotechar);
@ -724,104 +715,83 @@ parse_process_char(ReaderObj *self, char c)
 		}
 		break;
 	case EAT_CRNL:
 		if (c == '\n' || c == '\r')
 			;
 		else if (c == '\0')
 			self->state = START_RECORD;
 		else {
 			PyErr_Format(error_obj, "new-line character seen in unquoted field - do you need to open the file in universal-newline mode?");
 			return -1;
 		}
 		break;
 	}
 	return 0;
 }
-/*
+static int
- * READER
+parse_reset(ReaderObj *self)
- */
+{
-#define R_OFF(x) offsetof(ReaderObj, x)
+	Py_XDECREF(self->fields);
-
+	self->fields = PyList_New(0);
-static struct PyMemberDef Reader_memberlist[] = {
+	if (self->fields == NULL)
-	{ "dialect", T_OBJECT, R_OFF(dialect), RO },
+		return -1;
-	{ "line_num", T_ULONG, R_OFF(line_num), RO },
+	self->field_len = 0;
-	{ NULL }
+	self->state = START_RECORD;
-};
+	self->numeric_field = 0;
 	return 0;
 }
 static PyObject *
 Reader_iternext(ReaderObj *self)
 {
        PyObject *lineobj;
-        PyObject *fields;
+        PyObject *fields = NULL;
-        char *line;
+        char *line, c;
 	int linelen;
 	if (parse_reset(self) < 0)
 		return NULL;
        do {
                lineobj = PyIter_Next(self->input_iter);
                if (lineobj == NULL) {
                        /* End of input OR exception */
                        if (!PyErr_Occurred() && self->field_len != 0)
-                                return PyErr_Format(error_obj,
+                                PyErr_Format(error_obj,
-                                                    "newline inside string");
+					     "newline inside string");
                        return NULL;
                }
 		++self->line_num;
                if (self->had_parse_error)
 			if (parse_reset(self) < 0) {
 				Py_DECREF(lineobj);
 				return NULL;
 			}
                line = PyString_AsString(lineobj);
 		linelen = PyString_Size(lineobj);
-                if (line == NULL) {
+                if (line == NULL || linelen < 0) {
                        Py_DECREF(lineobj);
                        return NULL;
                }
-		if (strlen(line) < (size_t)PyString_GET_SIZE(lineobj)) {
+                while (linelen--) {
-			self->had_parse_error = 1;
+			c = *line++;
-			Py_DECREF(lineobj);
+			if (c == '\0') {
-			return PyErr_Format(error_obj,
+				Py_DECREF(lineobj);
-					    "string with NUL bytes");
+				PyErr_Format(error_obj,
-		}
+					     "line contains NULL byte");
-
+				goto err;
-                /* Process line of text - send '\n' to processing code to
+			}
                represent end of line.  End of line which is not at end of
                string is an error. */
                while (*line) {
                        char c;
                        c = *line++;
                        if (c == '\r') {
                                c = *line++;
                                if (c == '\0')
                                        /* macintosh end of line */
                                        break;
                                if (c == '\n') {
                                        c = *line++;
                                        if (c == '\0')
                                                /* DOS end of line */
                                                break;
                                }
                                self->had_parse_error = 1;
                                Py_DECREF(lineobj);
                                return PyErr_Format(error_obj,
                                                    "newline inside string");
                        }
                        if (c == '\n') {
                                c = *line++;
                                if (c == '\0')
                                        /* unix end of line */
                                        break;
                                self->had_parse_error = 1;
                                Py_DECREF(lineobj);
                                return PyErr_Format(error_obj, 
                                                    "newline inside string");
                        }
 			if (parse_process_char(self, c) < 0) {
 				Py_DECREF(lineobj);
-				return NULL;
+				goto err;
 			}
 		}
 		if (parse_process_char(self, '\n') < 0) {
 			Py_DECREF(lineobj);
 			return NULL;
 		}
                Py_DECREF(lineobj);
 		if (parse_process_char(self, 0) < 0)
 			goto err;
        } while (self->state != START_RECORD);
        fields = self->fields;
-        self->fields = PyList_New(0);
+        self->fields = NULL;
 err:
        return fields;
 }
@ -875,6 +845,14 @@ PyDoc_STRVAR(Reader_Type_doc,
 /* Method table for the reader type: it contains only the terminating
    sentinel entry, i.e. no methods are defined here. */
 static struct PyMethodDef Reader_methods[] = {
 	{ NULL, NULL }
 };
 /* Byte offset of field x within ReaderObj, for use in the member table. */
 #define R_OFF(x) offsetof(ReaderObj, x)
 /* Member table for the reader type: maps the attribute names "dialect"
    and "line_num" onto the corresponding ReaderObj fields.  Both entries
    carry the RO flag (presumably read-only -- confirm against
    structmember.h).  The list ends with a NULL sentinel entry. */
 static struct PyMemberDef Reader_memberlist[] = {
 	{ "dialect", T_OBJECT, R_OFF(dialect), RO },
 	{ "line_num", T_ULONG, R_OFF(line_num), RO },
 	{ NULL }
 };
 static PyTypeObject Reader_Type = {
 	PyObject_HEAD_INIT(NULL)