Patch #1031213: Decode source line in SyntaxErrors back to its original

source encoding. Will backport to 2.5.
2025-09-16 21:56:14 +00:00 · 2007-09-04 14:19:28 +00:00 · 2007-09-04 14:19:28 +00:00 · a5136196bc
commit a5136196bc
parent 58bd49f5fe
6 changed files with 107 additions and 5 deletions
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@ -1522,6 +1522,68 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
 	return result;
 }

+/* This function is only called from parsetok. However, it cannot live
+   there, as it must be empty for PGEN, and we can check for PGEN only
+   in this file. */
+
+#ifdef PGEN
+/* PGEN variant: deliberately does no work.  All three arguments are
+   ignored and the result is always NULL. */
+char *
+PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
+{
+	char *restored = NULL;
+	return restored;
+}
+#else
+/* Decode the first LEN bytes of TEXT as UTF-8 and re-encode the result
+   with the codec named ENC.  Both steps use the "replace" error handler.
+
+   Returns the object produced by PyUnicode_AsEncodedString(), or NULL
+   if either step fails; on failure the pending exception is reported
+   through PyErr_Print(). */
+static PyObject *
+dec_utf8(const char *enc, const char *text, size_t len)
+{
+	PyObject *encoded;
+	PyObject *decoded = PyUnicode_DecodeUTF8(text, len, "replace");
+
+	if (decoded == NULL) {
+		PyErr_Print();
+		return NULL;
+	}
+
+	encoded = PyUnicode_AsEncodedString(decoded, enc, "replace");
+	/* The intermediate unicode object is no longer needed. */
+	Py_DECREF(decoded);
+	if (encoded == NULL)
+		PyErr_Print();
+	return encoded;
+}
+
+/* Re-encode the first LEN bytes of tok->buf (UTF-8) into the encoding
+   named by tok->encoding and return the result as a freshly allocated,
+   NUL-terminated buffer obtained from PyObject_MALLOC; releasing it is
+   the caller's responsibility.
+
+   *OFFSET is a 1-based position within the UTF-8 text.  When a buffer is
+   returned and *OFFSET > 1, it is rewritten to the matching position in
+   the re-encoded text (length of the re-encoded prefix, plus one).
+
+   Returns NULL when tok->encoding is not set, when the conversion
+   fails, or when memory cannot be allocated.  In every NULL case
+   *OFFSET is left untouched, so it still matches the unconverted
+   text. */
+char *
+PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
+{
+	char *text;
+	const char *line;
+	Py_ssize_t linelen;
+	PyObject *lineobj;
+
+	if (!tok->encoding)
+		return NULL;
+
+	/* convert source to original encoding */
+	lineobj = dec_utf8(tok->encoding, tok->buf, len);
+	if (lineobj == NULL)
+		return NULL;
+
+	line = PyString_AsString(lineobj);
+	linelen = PyString_Size(lineobj);
+	if (line == NULL || linelen < 0) {
+		/* Not a usable string object: give up rather than hand
+		   back an uninitialised buffer, and do not leave the
+		   exception raised by the accessors pending. */
+		PyErr_Clear();
+		Py_DECREF(lineobj);
+		return NULL;
+	}
+
+	text = PyObject_MALLOC(linelen + 1);
+	if (text != NULL) {
+		/* The length is known, so copy exactly that many bytes
+		   and terminate explicitly. */
+		memcpy(text, line, (size_t)linelen);
+		text[linelen] = '\0';
+	}
+	Py_DECREF(lineobj);
+	if (text == NULL)
+		return NULL;
+
+	/* adjust error offset: re-encode only the prefix that precedes
+	   the error position and measure its length */
+	if (*offset > 1) {
+		PyObject *offsetobj = dec_utf8(tok->encoding,
+					       tok->buf, *offset-1);
+		if (offsetobj) {
+			Py_ssize_t prefixlen = PyString_Size(offsetobj);
+			if (prefixlen >= 0)
+				*offset = (int)prefixlen + 1;
+			else
+				PyErr_Clear();
+			Py_DECREF(offsetobj);
+		}
+	}
+	return text;
+}
+#endif
+
+			   
+
 #ifdef Py_DEBUG

 void