Bug #2301: Don't try decoding the source code into the original
encoding for syntax errors.
2025-12-08 18:32:16 +00:00 · 2008-03-17 20:43:42 +00:00 · 2008-03-17 20:43:42 +00:00 · 2593146227
commit 2593146227
parent ddaa7064ee
4 changed files with 18 additions and 74 deletions
--- a/Lib/test/test_pep263.py
+++ b/Lib/test/test_pep263.py
@ -23,6 +23,13 @@ class PEP263Test(unittest.TestCase):
        exec(c, d)
        self.assertEqual(d['u'], '\xf3')
    def test_issue2301(self):
        # Regression test for issue #2301: compile a bytes source that
        # declares "coding: cp932" and is expected to raise SyntaxError.
        # The exception's .text must be the decoded str "print '\u5e74'",
        # not the raw bytes b'\x94\x4e' from the original encoding.
        try:
            compile(b"# coding: cp932\nprint '\x94\x4e'", "dummy", "exec")
        except SyntaxError as v:
            self.assertEquals(v.text, "print '\u5e74'")
        else:
            # No SyntaxError was raised at all: the test must fail.
            self.fail()
 def test_main():
    test_support.run_unittest(PEP263Test)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -9,6 +9,12 @@ What's New in Python 3.0a4?
 *Release date: XX-XXX-2008*
 Core and Builtins
 -----------------
 - Bug #2301: Don't try decoding the source code into the original
  encoding for syntax errors.
 Extension Modules
 -----------------
--- a/Parser/parsetok.c
+++ b/Parser/parsetok.c
@ -213,22 +213,17 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
 			err_ret->error = E_EOF;
 		err_ret->lineno = tok->lineno;
 		if (tok->buf != NULL) {
 			char *text = NULL;
 			size_t len;
 			assert(tok->cur - tok->buf < INT_MAX);
 			err_ret->offset = (int)(tok->cur - tok->buf);
 			len = tok->inp - tok->buf;
-			text = PyTokenizer_RestoreEncoding(tok, len, &err_ret->offset);
+			err_ret->text = (char *) PyObject_MALLOC(len + 1);
-			if (text == NULL) {
+			if (err_ret->text != NULL) {
 				text = (char *) PyObject_MALLOC(len + 1);
 				if (text != NULL) {
 				if (len > 0)
-						strncpy(text, tok->buf, len);
+					strncpy(err_ret->text, tok->buf, len);
-					text[len] = '\0';
+				err_ret->text[len] = '\0';
 			}
 		}
 			err_ret->text = text;
 		}
 	} else if (tok->encoding != NULL) {
 		node* r = PyNode_New(encoding_decl);
 		if (!r) {
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@ -1579,70 +1579,6 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
 	return result;
 }
 /* This function is only called from parsetok. However, it cannot live
   there, as it must be empty for PGEN, and we can check for PGEN only
   in this file. */
 #ifdef PGEN
 /* PGEN variant: performs no conversion and always returns NULL;
    tok, len and offset are left untouched. */
 char*
 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
 {
 	return NULL;
 }
 #else
 /* Re-encode a UTF-8 buffer into another encoding.

    The first `len` bytes of `text` are decoded as UTF-8 and the result
    is then encoded with `enc`; both steps use the "replace" error
    handler.

    Returns the object produced by PyUnicode_AsEncodedString, or NULL if
    either step failed.  On failure the pending Python exception is
    cleared, so callers only see the NULL result. */
 static PyObject *
 dec_utf8(const char *enc, const char *text, size_t len) {
 	PyObject *ret = NULL;
 	PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
 	if (unicode_text) {
 		ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
 		/* the intermediate unicode object is no longer needed */
 		Py_DECREF(unicode_text);
 	}
 	if (!ret) {
 		/* swallow the error: failure is reported via NULL only */
 		PyErr_Clear();
 	}
        else {
 		assert(PyString_Check(ret));
 	}
 	return ret;
 }
 /* Convert the tokenizer buffer back to the declared source encoding.

    If tok->encoding is set, the first `len` bytes of tok->buf are
    re-encoded from UTF-8 into tok->encoding (via dec_utf8) and copied
    into a new NUL-terminated buffer obtained from PyObject_MALLOC.
    When *offset > 1 it is recomputed as 1 + the size of the re-encoded
    first (*offset - 1) bytes of tok->buf.

    Returns the new buffer, or NULL when tok->encoding is unset, the
    conversion fails, or the allocation fails.

    NOTE(review): the offset adjustment is not conditional on the
    allocation succeeding, so *offset may be rewritten even when NULL
    is returned. */
 char *
 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
 {
 	char *text = NULL;
 	if (tok->encoding) {
 		/* convert source to original encoding */
 		PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
 		if (lineobj != NULL) {
 			int linelen = PyString_GET_SIZE(lineobj);
 			const char *line = PyString_AS_STRING(lineobj);
 			text = PyObject_MALLOC(linelen + 1);
 			if (text != NULL && line != NULL) {
 				if (linelen)
 					strncpy(text, line, linelen);
 				/* strncpy does not terminate; do it explicitly */
 				text[linelen] = '\0';
 			}
 			Py_DECREF(lineobj);
 			/* adjust error offset */
 			if (*offset > 1) {
 				PyObject *offsetobj = dec_utf8(tok->encoding,
 							       tok->buf,
 							       *offset-1);
 				if (offsetobj) {
 					*offset = 1 + Py_SIZE(offsetobj);
 					Py_DECREF(offsetobj);
 				}
 			}
 		}
 	}
 	return text;
 }
 #endif
 /* Get -*- encoding -*- from a Python file.
   PyTokenizer_FindEncoding returns NULL when it can't find the encoding in