From 904af3de2bef6d971463a564541cb6dadf22d7f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Langa?= Date: Sat, 20 Nov 2021 16:34:56 +0100 Subject: [PATCH] [3.10] bpo-45848: Allow the parser to get error lines from encoded files (GH-29646) (GH-29661) (cherry picked from commit fdcc46d9554094994f78bedf6dc9220e5d5ee668) Co-authored-by: Pablo Galindo Salgado --- .gitignore | 6 ++++++ Include/cpython/pyerrors.h | 6 ++++++ Lib/test/test_exceptions.py | 13 +++++++++++++ .../2021-11-19-22-57-42.bpo-45848.HgVBJ5.rst | 2 ++ Parser/pegen.c | 15 ++++++++------- Python/errors.c | 18 ++++++++++++++---- 6 files changed, 49 insertions(+), 11 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2021-11-19-22-57-42.bpo-45848.HgVBJ5.rst diff --git a/.gitignore b/.gitignore index d0b608892a8..19b4214a9ae 100644 --- a/.gitignore +++ b/.gitignore @@ -134,3 +134,9 @@ Tools/ssl/win32 # Ignore ./python binary on Unix but still look into ./Python/ directory. /python !/Python/ + +# Artifacts generated by 3.11 lying around when switching branches: +/_bootstrap_python +/Programs/_freeze_module +/Python/deepfreeze/ +/Python/frozen_modules/ \ No newline at end of file diff --git a/Include/cpython/pyerrors.h b/Include/cpython/pyerrors.h index 5e57129c3b8..3f952456679 100644 --- a/Include/cpython/pyerrors.h +++ b/Include/cpython/pyerrors.h @@ -185,6 +185,12 @@ Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject *) PyUnicodeTranslateError_Create( Py_ssize_t end, const char *reason /* UTF-8 encoded string */ ); + +PyAPI_FUNC(PyObject *) _PyErr_ProgramDecodedTextObject( + PyObject *filename, + int lineno, + const char* encoding); + PyAPI_FUNC(PyObject *) _PyUnicodeTranslateError_Create( PyObject *object, Py_ssize_t start, diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py index 8419f582cbd..04c883cf53b 100644 --- a/Lib/test/test_exceptions.py +++ b/Lib/test/test_exceptions.py @@ -2352,6 +2352,19 @@ class SyntaxErrorTests(unittest.TestCase): finally: unlink(TESTFN) + # Check backwards tokenizer errors + source = '# -*- coding: ascii -*-\n\n(\n' + try: + with open(TESTFN, 'w', encoding='ascii') as testfile: + testfile.write(source) + rc, out, err = script_helper.assert_python_failure('-Wd', '-X', 'utf8', TESTFN) + err = err.decode('utf-8').splitlines() + + self.assertEqual(err[-3], ' (') + self.assertEqual(err[-2], ' ^') + finally: + unlink(TESTFN) + def test_attributes_new_constructor(self): args = ("bad.py", 1, 2, "abcdefg", 1, 100) the_exception = SyntaxError("bad bad", args) diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-11-19-22-57-42.bpo-45848.HgVBJ5.rst b/Misc/NEWS.d/next/Core and Builtins/2021-11-19-22-57-42.bpo-45848.HgVBJ5.rst new file mode 100644 index 00000000000..d9394c9c1f0 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2021-11-19-22-57-42.bpo-45848.HgVBJ5.rst @@ -0,0 +1,2 @@ +Allow the parser to obtain error lines directly from encoded files. Patch by +Pablo Galindo diff --git a/Parser/pegen.c b/Parser/pegen.c index c6570eb1bd0..9bf4fe7ecd8 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -480,14 +480,12 @@ _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, goto error; } - // PyErr_ProgramTextObject assumes that the text is utf-8 so we cannot call it with a file - // with an arbitrary encoding or otherwise we could get some badly decoded text. - int uses_utf8_codec = (!p->tok->encoding || strcmp(p->tok->encoding, "utf-8") == 0); if (p->tok->fp_interactive) { error_line = get_error_line(p, lineno); } - else if (uses_utf8_codec && p->start_rule == Py_file_input) { - error_line = PyErr_ProgramTextObject(p->tok->filename, (int) lineno); + else if (p->start_rule == Py_file_input) { + error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename, + (int) lineno, p->tok->encoding); } if (!error_line) { @@ -498,15 +496,18 @@ _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, we're actually parsing from a file, which has an E_EOF SyntaxError and in that case `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which does not physically exist */ - assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF || !uses_utf8_codec); + assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF); if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) { Py_ssize_t size = p->tok->inp - p->tok->buf; error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace"); } - else { + else if (p->tok->fp == NULL || p->tok->fp == stdin) { error_line = get_error_line(p, lineno); } + else { + error_line = PyUnicode_FromStringAndSize("", 0); + } if (!error_line) { goto error; } diff --git a/Python/errors.c b/Python/errors.c index 600300e263d..bc1b55e440e 100644 --- a/Python/errors.c +++ b/Python/errors.c @@ -1724,7 +1724,7 @@ PyErr_SyntaxLocationEx(const char *filename, int lineno, int col_offset) functionality in tb_displayline() in traceback.c. */ static PyObject * -err_programtext(PyThreadState *tstate, FILE *fp, int lineno) +err_programtext(PyThreadState *tstate, FILE *fp, int lineno, const char* encoding) { int i; char linebuf[1000]; @@ -1752,7 +1752,11 @@ after_loop: fclose(fp); if (i == lineno) { PyObject *res; - res = PyUnicode_FromString(linebuf); + if (encoding != NULL) { + res = PyUnicode_Decode(linebuf, strlen(linebuf), encoding, "replace"); + } else { + res = PyUnicode_FromString(linebuf); + } if (res == NULL) _PyErr_Clear(tstate); return res; @@ -1778,7 +1782,7 @@ PyErr_ProgramText(const char *filename, int lineno) } PyObject * -PyErr_ProgramTextObject(PyObject *filename, int lineno) +_PyErr_ProgramDecodedTextObject(PyObject *filename, int lineno, const char* encoding) { if (filename == NULL || lineno <= 0) { return NULL; @@ -1790,7 +1794,13 @@ PyErr_ProgramTextObject(PyObject *filename, int lineno) _PyErr_Clear(tstate); return NULL; } - return err_programtext(tstate, fp, lineno); + return err_programtext(tstate, fp, lineno, encoding); +} + +PyObject * +PyErr_ProgramTextObject(PyObject *filename, int lineno) +{ + return _PyErr_ProgramDecodedTextObject(filename, lineno, NULL); } #ifdef __cplusplus