Patch #1031213: Decode source line in SyntaxErrors back to its original source encoding.

Will backport to 2.5.
2025-09-17 14:16:02 +00:00 · 2007-09-04 14:19:28 +00:00 · 2007-09-04 14:19:28 +00:00 · a5136196bc
commit a5136196bc
parent 58bd49f5fe
6 changed files with 107 additions and 5 deletions
--- a/Lib/test/test_compiler.py
+++ b/Lib/test/test_compiler.py
@ -155,6 +155,32 @@ class CompilerTest(unittest.TestCase):
        self.assertEquals(dct.get('result'), 1)
    def _testErrEnc(self, src, text, offset):
        try:
            compile(src, "", "exec")
        except SyntaxError, e:
            self.assertEquals(e.offset, offset)
            self.assertEquals(e.text, text)
    def testSourceCodeEncodingsError(self):
        # SyntaxError.text and SyntaxError.offset are checked for sources
        # with and without an encoding declaration line.
        coding_line = "#! -*- coding: ShiftJIS -*-\n"
        plain_src = "print '12345678', '\n"
        sjis_src = "print '\x83\x70\x83\x43\x83\x5c\x83\x93', '\n"

        # (source handed to compile(), expected SyntaxError.text)
        cases = (
            # ascii source without encdef
            (plain_src, plain_src),
            # ascii source with encdef
            (coding_line + plain_src, plain_src),
            # non-ascii source with encdef
            (coding_line + sjis_src, sjis_src),
            # ShiftJIS source without encdef
            (sjis_src, sjis_src),
        )
        for source, expected_text in cases:
            self._testErrEnc(source, expected_text, 19)
 NOLINENO = (compiler.ast.Module, compiler.ast.Stmt, compiler.ast.Discard)
 ###############################################################################
--- a/Misc/ACKS
+++ b/Misc/ACKS
@ -320,6 +320,7 @@ Lars Immisch
 Tony Ingraldi
 John Interrante
 Bob Ippolito
 Atsuo Ishimoto
 Ben Jackson
 Paul Jackson
 David Jacobs
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -12,6 +12,9 @@ What's New in Python 2.6 alpha 1?
 Core and builtins
 -----------------
 - Patch #1031213: Decode source line in SyntaxErrors back to its original source
  encoding.
 - Py_ssize_t fields work in structmember when HAVE_LONG_LONG is not defined.
 - PEP 3123: Provide forward compatibility with Python 3.0, while keeping
--- a/Parser/parsetok.c
+++ b/Parser/parsetok.c
@ -218,16 +218,24 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
 			err_ret->error = E_EOF;
 		err_ret->lineno = tok->lineno;
 		if (tok->buf != NULL) {
 			char *text = NULL;
 			size_t len;
 			assert(tok->cur - tok->buf < INT_MAX);
 			err_ret->offset = (int)(tok->cur - tok->buf);
 			len = tok->inp - tok->buf;
-			err_ret->text = (char *) PyObject_MALLOC(len + 1);
+#ifdef Py_USING_UNICODE
-			if (err_ret->text != NULL) {
+			text = PyTokenizer_RestoreEncoding(tok, len, &err_ret->offset);
-				if (len > 0)
+
-					strncpy(err_ret->text, tok->buf, len);
+#endif
-				err_ret->text[len] = '\0';
+			if (text == NULL) {
 				text = (char *) PyObject_MALLOC(len + 1);
 				if (text != NULL) {
 					if (len > 0)
 						strncpy(text, tok->buf, len);
 					text[len] = '\0';
 				}
 			}
 			err_ret->text = text;
 		}
 	} else if (tok->encoding != NULL) {
 		node* r = PyNode_New(encoding_decl);
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@ -1522,6 +1522,68 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
 	return result;
 }
 /* This function is only called from parsetok. However, it cannot live
   there, as it must be empty for PGEN, and we can check for PGEN only
   in this file. */
 #ifdef PGEN
 /* PGEN variant: performs no conversion and always returns NULL.
    All three arguments are ignored. */
 char*
 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
 {
 	return NULL;
 }
 #else
 #ifdef Py_USING_UNICODE
 /* Decode the first LEN bytes of TEXT as UTF-8 and re-encode the result
    in the encoding named ENC.  Both steps use the "replace" error
    handler, so malformed input does not by itself make them fail.

    Returns a new reference to the encoded string object, or NULL if
    either step failed.  This is a best-effort helper for error
    reporting: on failure the pending exception is cleared rather than
    printed, so no traceback is written to stderr while a SyntaxError
    is being built. */
 static PyObject *
 dec_utf8(const char *enc, const char *text, size_t len) {
 	PyObject *ret = NULL;	
 	PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
 	if (unicode_text) {
 		ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
 		Py_DECREF(unicode_text);
 	}
 	if (!ret) {
 		PyErr_Clear();
 	}
 	return ret;
 }
 #endif /* Py_USING_UNICODE */

 /* Convert the first LEN bytes of tok->buf from UTF-8 back to
    tok->encoding.

    On success, returns a NUL-terminated buffer allocated with
    PyObject_MALLOC holding the re-encoded line, and rewrites *OFFSET
    so that it refers to a position in that buffer.

    Returns NULL, leaving *OFFSET untouched, when tok->encoding is not
    set, when the conversion or the allocation fails, or when Python is
    built without Unicode support.  The caller is then expected to fall
    back to the raw tokenizer buffer, for which the original offset is
    still the right one. */
 char *
 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
 {
 	char *text = NULL;
 #ifdef Py_USING_UNICODE
 	if (tok->encoding) {
 		/* convert source to original encoding */
 		PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
 		if (lineobj != NULL) {
 			Py_ssize_t linelen = PyString_Size(lineobj);
 			const char *line = PyString_AsString(lineobj);
 			if (line == NULL) {
 				/* never hand back an uninitialised buffer */
 				PyErr_Clear();
 			}
 			else {
 				text = PyObject_MALLOC(linelen + 1);
 			}
 			if (text != NULL) {
 				if (linelen)
 					strncpy(text, line, linelen);
 				text[linelen] = '\0';
 			}
 			Py_DECREF(lineobj);
 			/* adjust error offset, but only when the re-encoded
 			   text is really returned; otherwise the caller
 			   keeps using the UTF-8 buffer and its offset */
 			if (text != NULL && *offset > 1) {
 				PyObject *offsetobj = dec_utf8(tok->encoding, 
 							       tok->buf, *offset-1);
 				if (offsetobj) {
 					*offset = (int)PyString_Size(offsetobj) + 1;
 					Py_DECREF(offsetobj);
 				}
 			}
 		}
 	}
 #endif /* Py_USING_UNICODE */
 	return text;
 }
 #endif
 #ifdef Py_DEBUG
 void
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@ -58,6 +58,8 @@ extern struct tok_state *PyTokenizer_FromString(const char *);
 extern struct tok_state *PyTokenizer_FromFile(FILE *, char *, char *);
 extern void PyTokenizer_Free(struct tok_state *);
 extern int PyTokenizer_Get(struct tok_state *, char **, char **);
 extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok, 
 					  int len, int *offset);
 #ifdef __cplusplus
 }