mirror of
https://github.com/python/cpython.git
synced 2025-07-30 06:34:15 +00:00
fix several compile() issues by translating newlines in the tokenizer
This commit is contained in:
parent
c4cd6d3765
commit
e36199b49d
8 changed files with 96 additions and 30 deletions
Doc/library/functions.rst
@@ -173,11 +173,10 @@ available. They are listed here in alphabetical order.
 
    .. note::
 
-      When compiling a string with multi-line statements, line endings must be
-      represented by a single newline character (``'\n'``), and the input must
-      be terminated by at least one newline character. If line endings are
-      represented by ``'\r\n'``, use :meth:`str.replace` to change them into
-      ``'\n'``.
+      When compiling a string with multi-line statements in ``'single'`` or
+      ``'eval'`` mode, input must be terminated by at least one newline
+      character. This is to facilitate detection of incomplete and complete
+      statements in the :mod:`code` module.
 
    .. versionchanged:: 2.3
       The *flags* and *dont_inherit* arguments were added.
@@ -185,6 +184,10 @@ available. They are listed here in alphabetical order.
    .. versionchanged:: 2.6
       Support for compiling AST objects.
 
+   .. versionchanged:: 2.7
+      Allowed use of Windows and Mac newlines. Also input in ``'exec'`` mode
+      does not have to end in a newline anymore.
+
 
 .. function:: complex([real[, imag]])
 
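For context (not part of the patch), a small Python sketch of the behaviour the updated note and the new ``versionchanged 2.7`` entry describe; it mirrors the tests added to Lib/test/test_compile.py below, and the "<demo>" filename is just a placeholder:

# Illustrative only: 'exec' input may now use Windows (\r\n) or old Mac (\r)
# line endings, and no longer has to end with a newline.
compile("a = 1\r\nb = 2\r", "<demo>", "exec")
compile("a = 1", "<demo>", "exec")

# 'single' and 'eval' still expect multi-line statements to be terminated by
# '\n', which is what the code module relies on to tell complete statements
# from incomplete ones.
compile("if True:\n    pass\n", "<demo>", "single")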
Lib/test/test_codeop.py
@@ -295,10 +295,6 @@ class CodeopTests(unittest.TestCase):
         self.assertNotEquals(compile_command("a = 1\n", "abc").co_filename,
                              compile("a = 1\n", "def", 'single').co_filename)
 
-    def test_no_universal_newlines(self):
-        code = compile_command("'\rfoo\r'", symbol='eval')
-        self.assertEqual(eval(code), '\rfoo\r')
-
 
 def test_main():
     run_unittest(CodeopTests)
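The removed test asserted that carriage returns inside a one-line string literal survive compilation untouched. Since the tokenizer now translates '\r' in the raw source before tokenizing, that guarantee can no longer hold, which is presumably why the test goes away. A hedged illustration (not part of the patch; "<demo>" is a placeholder):

# A lone '\r' in source text is now treated as a line separator.
ns = {}
exec(compile("a = 1\rb = 2\r", "<demo>", "exec"), ns)
assert ns["a"] == 1 and ns["b"] == 2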
Lib/test/test_compile.py
@@ -5,6 +5,19 @@ from test import test_support
 
 class TestSpecifics(unittest.TestCase):
 
+    def test_no_ending_newline(self):
+        compile("hi", "<test>", "exec")
+        compile("hi\r", "<test>", "exec")
+
+    def test_empty(self):
+        compile("", "<test>", "exec")
+
+    def test_other_newlines(self):
+        compile("\r\n", "<test>", "exec")
+        compile("\r", "<test>", "exec")
+        compile("hi\r\nstuff\r\ndef f():\n pass\r", "<test>", "exec")
+        compile("this_is\rreally_old_mac\rdef f():\n pass", "<test>", "exec")
+
     def test_debug_assignment(self):
         # catch assignments to __debug__
         self.assertRaises(SyntaxError, compile, '__debug__ = 1', '?', 'single')
Lib/test/test_parser.py
@@ -243,9 +243,9 @@ class RoundtripLegalSyntaxTestCase(unittest.TestCase):
                           (14, '+', 2, 13),
                           (2, '1', 2, 15),
                           (4, '', 2, 16),
-                          (6, '', 2, -1),
-                          (4, '', 2, -1),
-                          (0, '', 2, -1)],
+                          (6, '', 3, -1),
+                          (4, '', 3, -1),
+                          (0, '', 3, -1)],
                          terminals)
 
 
Misc/NEWS
@@ -12,6 +12,9 @@ What's New in Python 2.7 alpha 1
 Core and Builtins
 -----------------
 
+- Fix several issues with compile(). The input can now contain Windows and Mac
+  newlines and is no longer required to end in a newline.
+
 - Remove length limitation when constructing a complex number from a
   unicode string.
 
Parser/parsetok.c
@@ -51,7 +51,7 @@ PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename,
 
     initerr(err_ret, filename);
 
-    if ((tok = PyTokenizer_FromString(s)) == NULL) {
+    if ((tok = PyTokenizer_FromString(s, start == file_input)) == NULL) {
         err_ret->error = PyErr_Occurred() ? E_DECODE : E_NOMEM;
         return NULL;
     }
Parser/tokenizer.c
@@ -105,6 +105,7 @@ tok_new(void)
     tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
     tok->done = E_OK;
     tok->fp = NULL;
+    tok->input = NULL;
     tok->tabsize = TABSIZE;
     tok->indent = 0;
     tok->indstack[0] = 0;
@@ -130,6 +131,17 @@ tok_new(void)
     return tok;
 }
 
+static char *
+new_string(const char *s, Py_ssize_t len)
+{
+    char* result = (char *)PyMem_MALLOC(len + 1);
+    if (result != NULL) {
+        memcpy(result, s, len);
+        result[len] = '\0';
+    }
+    return result;
+}
+
 #ifdef PGEN
 
 static char *
@@ -144,10 +156,10 @@ decoding_feof(struct tok_state *tok)
     return feof(tok->fp);
 }
 
-static const char *
-decode_str(const char *str, struct tok_state *tok)
+static char *
+decode_str(const char *str, int exec_input, struct tok_state *tok)
 {
-    return str;
+    return new_string(str, strlen(str));
 }
 
 #else /* PGEN */
@@ -162,16 +174,6 @@ error_ret(struct tok_state *tok) /* XXX */
     return NULL; /* as if it were EOF */
 }
 
-static char *
-new_string(const char *s, Py_ssize_t len)
-{
-    char* result = (char *)PyMem_MALLOC(len + 1);
-    if (result != NULL) {
-        memcpy(result, s, len);
-        result[len] = '\0';
-    }
-    return result;
-}
 
 static char *
 get_normal_name(char *s) /* for utf-8 and latin-1 */
@@ -586,17 +588,63 @@ translate_into_utf8(const char* str, const char* enc) {
 }
 #endif
 
+
+static char *
+translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
+    int skip_next_lf = 0, length = strlen(s), final_length;
+    char *buf, *current;
+    char c;
+    buf = PyMem_MALLOC(length + 2);
+    if (buf == NULL) {
+        tok->done = E_NOMEM;
+        return NULL;
+    }
+    for (current = buf; (c = *s++);) {
+        if (skip_next_lf) {
+            skip_next_lf = 0;
+            if (c == '\n') {
+                c = *s;
+                s++;
+                if (!c)
+                    break;
+            }
+        }
+        if (c == '\r') {
+            skip_next_lf = 1;
+            c = '\n';
+        }
+        *current = c;
+        current++;
+    }
+    /* If this is exec input, add a newline to the end of the file if
+       there isn't one already. */
+    if (exec_input && *current != '\n') {
+        *current = '\n';
+        current++;
+    }
+    *current = '\0';
+    final_length = current - buf;
+    if (final_length < length && final_length)
+        /* should never fail */
+        buf = PyMem_REALLOC(buf, final_length + 1);
+    return buf;
+}
+
 /* Decode a byte string STR for use as the buffer of TOK.
    Look for encoding declarations inside STR, and record them
    inside TOK. */
 
 static const char *
-decode_str(const char *str, struct tok_state *tok)
+decode_str(const char *input, int single, struct tok_state *tok)
 {
     PyObject* utf8 = NULL;
+    const char *str;
     const char *s;
     const char *newl[2] = {NULL, NULL};
     int lineno = 0;
+    tok->input = str = translate_newlines(input, single, tok);
+    if (str == NULL)
+        return NULL;
     tok->enc = NULL;
     tok->str = str;
     if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
@@ -651,12 +699,12 @@ decode_str(const char *str, struct tok_state *tok)
 /* Set up tokenizer for string */
 
 struct tok_state *
-PyTokenizer_FromString(const char *str)
+PyTokenizer_FromString(const char *str, int exec_input)
 {
     struct tok_state *tok = tok_new();
     if (tok == NULL)
         return NULL;
-    str = (char *)decode_str(str, tok);
+    str = (char *)decode_str(str, exec_input, tok);
     if (str == NULL) {
         PyTokenizer_Free(tok);
         return NULL;
@@ -702,6 +750,8 @@ PyTokenizer_Free(struct tok_state *tok)
 #endif
     if (tok->fp != NULL && tok->buf != NULL)
         PyMem_FREE(tok->buf);
+    if (tok->input)
+        PyMem_FREE((char *)tok->input);
     PyMem_FREE(tok);
 }
 
Parser/tokenizer.h
@@ -52,9 +52,10 @@ struct tok_state {
 #endif
     const char* enc;
     const char* str;
+    const char* input; /* Tokenizer's newline translated copy of the string. */
 };
 
-extern struct tok_state *PyTokenizer_FromString(const char *);
+extern struct tok_state *PyTokenizer_FromString(const char *, int);
 extern struct tok_state *PyTokenizer_FromFile(FILE *, char *, char *);
 extern void PyTokenizer_Free(struct tok_state *);
 extern int PyTokenizer_Get(struct tok_state *, char **, char **);