Issue 28128: Print out better error/warning messages for invalid string escapes.

2025-09-26 18:29:57 +00:00 · 2016-10-31 09:22:08 -04:00 · 2016-10-31 09:22:08 -04:00 · 42454af094
commit 42454af094
parent a99cdb21a7
8 changed files with 173 additions and 22 deletions
--- a/Include/bytesobject.h
+++ b/Include/bytesobject.h
@ -74,6 +74,11 @@ PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
 PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t,
 						   const char *, Py_ssize_t,
 						   const char *);
 /* Helper for PyBytes_DecodeEscape that detects invalid escape chars.
    The final argument is an out-parameter: on successful return it is NULL
    when no invalid escape was seen, otherwise it points at the first
    invalid escaped char (the char following the backslash).  The helper
    itself does not issue a warning; reporting is left to the caller. */
 PyAPI_FUNC(PyObject *) _PyBytes_DecodeEscape(const char *, Py_ssize_t,
                                             const char *, Py_ssize_t,
                                             const char *,
                                             const char **);
 /* Macro, trading safety for speed */
 #ifndef Py_LIMITED_API
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@ -1486,6 +1486,17 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
    const char *errors          /* error handling */
    );
 /* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
   chars.  It only records the first invalid escape; issuing a warning
   for it is left to the caller. */
 PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape(
        const char *string,     /* Unicode-Escape encoded string */
        Py_ssize_t length,      /* size of string */
        const char *errors,     /* error handling */
        const char **first_invalid_escape  /* out-parameter: on success,
                                              NULL if every escape was
                                              valid, otherwise points to
                                              the first invalid escaped
                                              char in string. */
 );
 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
    PyObject *unicode           /* Unicode object */
    );
--- a/Lib/test/test_string_literals.py
+++ b/Lib/test/test_string_literals.py
@ -31,6 +31,7 @@ import os
 import sys
 import shutil
 import tempfile
 import warnings
 import unittest
@ -104,6 +105,19 @@ class TestLiterals(unittest.TestCase):
        self.assertRaises(SyntaxError, eval, r""" '\U000000' """)
        self.assertRaises(SyntaxError, eval, r""" '\U0000000' """)
    def test_eval_str_invalid_escape(self):
        # Every ASCII character that does not start a recognized str escape
        # must trigger a DeprecationWarning and keep the backslash verbatim.
        recognized = b"""\n\r"'01234567NU\\abfnrtuvx"""
        for code in range(1, 128):
            if code not in recognized:
                with self.assertWarns(DeprecationWarning):
                    self.assertEqual(eval(r"'\%c'" % code), '\\' + chr(code))
        # The warning must be attributed to the compiled source: exactly one
        # warning, reported against '<string>' at the literal's line number.
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter('always', category=DeprecationWarning)
            eval("'''\n\\z'''")
        self.assertEqual(len(caught), 1)
        only_warning = caught[0]
        self.assertEqual(only_warning.filename, '<string>')
        self.assertEqual(only_warning.lineno, 2)
    def test_eval_str_raw(self):
        self.assertEqual(eval(""" r'x' """), 'x')
        self.assertEqual(eval(r""" r'\x01' """), '\\' + 'x01')
@ -130,6 +144,19 @@ class TestLiterals(unittest.TestCase):
        self.assertRaises(SyntaxError, eval, r""" b'\x' """)
        self.assertRaises(SyntaxError, eval, r""" b'\x0' """)
    def test_eval_bytes_invalid_escape(self):
        # Every ASCII character that does not start a recognized bytes escape
        # must trigger a DeprecationWarning and keep the backslash verbatim.
        recognized = b"""\n\r"'01234567\\abfnrtvx"""
        for code in range(1, 128):
            if code not in recognized:
                with self.assertWarns(DeprecationWarning):
                    self.assertEqual(eval(r"b'\%c'" % code),
                                     b'\\' + bytes([code]))
        # The warning must be attributed to the compiled source: exactly one
        # warning, reported against '<string>' at the literal's line number.
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter('always', category=DeprecationWarning)
            eval("b'''\n\\z'''")
        self.assertEqual(len(caught), 1)
        only_warning = caught[0]
        self.assertEqual(only_warning.filename, '<string>')
        self.assertEqual(only_warning.lineno, 2)
    def test_eval_bytes_raw(self):
        self.assertEqual(eval(""" br'x' """), b'x')
        self.assertEqual(eval(""" rb'x' """), b'x')
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@ -2413,13 +2413,6 @@ class UnicodeTest(string_tests.CommonTest,
        support.check_free_after_iterating(self, iter, str)
        support.check_free_after_iterating(self, reversed, str)
    def test_invalid_sequences(self):
        # Digits 0-7 begin octal escapes, so only 8 and 9 join the letters
        # as candidates; letters that start a real escape are left out.
        candidates = [ch for ch in string.ascii_letters + "89"
                      if ch not in "abfnrtuvxNU"]
        for letter in candidates:
            with self.assertWarns(DeprecationWarning):
                eval(r"'\%s'" % letter)
 class CAPITest(unittest.TestCase):
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -10,6 +10,10 @@ What's New in Python 3.7.0 alpha 1
 Core and Builtins
 -----------------
 - Issue #28128: The deprecation warning for invalid str and bytes escape
  sequences now reports more precise information about where the invalid
  escape occurs. Patch by Serhiy Storchaka and Eric Smith.
 - Issue #28509: dict.update() no longer allocates unnecessarily large memory.
 - Issue #28426: Fixed potential crash in PyUnicode_AsDecodedObject() in debug
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@ -1105,11 +1105,12 @@ _PyBytes_DecodeEscapeRecode(const char **s, const char *end,
    return p;
 }
-PyObject *PyBytes_DecodeEscape(const char *s,
+PyObject *_PyBytes_DecodeEscape(const char *s,
                                Py_ssize_t len,
                                const char *errors,
                                Py_ssize_t unicode,
-                                const char *recode_encoding)
+                                const char *recode_encoding,
                                const char **first_invalid_escape)
 {
    int c;
    char *p;
@ -1123,6 +1124,8 @@ PyObject *PyBytes_DecodeEscape(const char *s,
        return NULL;
    writer.overallocate = 1;
    *first_invalid_escape = NULL;
    end = s + len;
    while (s < end) {
        if (*s != '\\') {
@ -1207,9 +1210,12 @@ PyObject *PyBytes_DecodeEscape(const char *s,
            break;
        default:
-            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, "invalid escape sequence '\\%c'", *(--s)) < 0)
+            if (*first_invalid_escape == NULL) {
-                goto failed;
+                *first_invalid_escape = s-1; /* Back up one char, since we've
                                                already incremented s. */
            }
            *p++ = '\\';
            s--;
            goto non_esc; /* an arbitrary number of unescaped
                             UTF-8 bytes may follow. */
        }
@ -1222,6 +1228,29 @@ PyObject *PyBytes_DecodeEscape(const char *s,
    return NULL;
 }
 /* Decode the backslash escapes in the len bytes starting at s.
    All decoding is delegated to _PyBytes_DecodeEscape(); the arguments
    errors, unicode and recode_encoding are passed through unchanged.  If
    the helper recorded an invalid escape sequence, a single
    DeprecationWarning naming the first offending character is issued.
    Returns the decoded object, or NULL with an exception set if decoding
    failed or the warning was raised as an exception. */
 PyObject *PyBytes_DecodeEscape(const char *s,
                                 Py_ssize_t len,
                                 const char *errors,
                                 Py_ssize_t unicode,
                                 const char *recode_encoding)
 {
    const char *first_invalid_escape;
    PyObject *result = _PyBytes_DecodeEscape(s, len, errors, unicode,
                                             recode_encoding,
                                             &first_invalid_escape);
    if (result == NULL)
        return NULL;
    if (first_invalid_escape != NULL) {
        /* Cast through unsigned char: where plain char is signed, a byte
           >= 0x80 would be promoted to a negative int, which the %c
           format rejects with OverflowError instead of producing the
           intended warning. */
        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
                             "invalid escape sequence '\\%c'",
                             (unsigned char)*first_invalid_escape) < 0) {
            /* The warning became an exception: drop the decoded result. */
            Py_DECREF(result);
            return NULL;
        }
    }
    return result;
 }
 /* -------------------------------------------------------------------- */
 /* object api */
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -5877,9 +5877,10 @@ PyUnicode_AsUTF16String(PyObject *unicode)
 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
 PyObject *
-PyUnicode_DecodeUnicodeEscape(const char *s,
+_PyUnicode_DecodeUnicodeEscape(const char *s,
                               Py_ssize_t size,
-                              const char *errors)
+                               const char *errors,
                               const char **first_invalid_escape)
 {
    const char *starts = s;
    _PyUnicodeWriter writer;
@ -5887,6 +5888,9 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    // so we can remember if we've seen an invalid escape char or not
    *first_invalid_escape = NULL;
    if (size == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
@ -6061,9 +6065,10 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
            goto error;
        default:
-            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+            if (*first_invalid_escape == NULL) {
-                                 "invalid escape sequence '\\%c'", c) < 0)
+                *first_invalid_escape = s-1; /* Back up one char, since we've
-                goto onError;
+                                                already incremented s. */
            }
            WRITE_ASCII_CHAR('\\');
            WRITE_CHAR(c);
            continue;
@ -6098,6 +6103,27 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
    return NULL;
 }
 /* Decode the Unicode-Escape encoded buffer s of the given size.
    All decoding is delegated to _PyUnicode_DecodeUnicodeEscape().  If the
    helper recorded an invalid escape sequence, a single DeprecationWarning
    naming the first offending character is issued.
    Returns the decoded object, or NULL with an exception set if decoding
    failed or the warning was raised as an exception. */
 PyObject *
 PyUnicode_DecodeUnicodeEscape(const char *s,
                               Py_ssize_t size,
                               const char *errors)
 {
    const char *first_invalid_escape;
    PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
                                                      &first_invalid_escape);
    if (result == NULL)
        return NULL;
    if (first_invalid_escape != NULL) {
        /* Cast through unsigned char: where plain char is signed, a byte
           >= 0x80 would be promoted to a negative int, which the %c
           format rejects with OverflowError instead of producing the
           intended warning. */
        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
                             "invalid escape sequence '\\%c'",
                             (unsigned char)*first_invalid_escape) < 0) {
            /* The warning became an exception: drop the decoded result. */
            Py_DECREF(result);
            return NULL;
        }
    }
    return result;
 }
 /* Return a Unicode-Escape string version of the Unicode object.
   If quotes is true, the string is enclosed in u"" or u'' quotes as
--- a/Python/ast.c
+++ b/Python/ast.c
@ -4113,8 +4113,34 @@ decode_utf8(struct compiling *c, const char **sPtr, const char *end)
    return PyUnicode_DecodeUTF8(t, s - t, NULL);
 }
 /* Issue a DeprecationWarning for an invalid escape sequence in the literal
    at node n, attributed to c->c_filename and the line number of n.
    first_invalid_escape_char is the character that followed the backslash.
    If the warning is raised as a DeprecationWarning exception, it is
    re-reported through ast_error() (defined elsewhere; presumably a
    SyntaxError located at n -- confirm against its definition).
    Returns 0 if the warning was issued without raising, and -1 with an
    exception set on any failure. */
 static int
 warn_invalid_escape_sequence(struct compiling *c, const node *n,
                              char first_invalid_escape_char)
 {
    /* Cast through unsigned char so a byte >= 0x80 is never handed to %c
       as a negative int on platforms where plain char is signed. */
    PyObject *msg = PyUnicode_FromFormat(
        "invalid escape sequence \\%c",
        (unsigned char)first_invalid_escape_char);
    if (msg == NULL) {
        return -1;
    }
    if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg,
                                   c->c_filename, LINENO(n),
                                   NULL, NULL) < 0)
    {
        if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
            const char *s;

            /* Drop the pending DeprecationWarning before making further
               C-API calls; ast_error() reports the problem instead. */
            PyErr_Clear();

            s = PyUnicode_AsUTF8(msg);
            if (s != NULL) {
                ast_error(c, n, s);
            }
        }
        /* Any failure must be reported to the caller.  Previously a
           non-DeprecationWarning error fell through and returned 0 with
           the exception still set. */
        Py_DECREF(msg);
        return -1;
    }
    Py_DECREF(msg);
    return 0;
 }
 static PyObject *
-decode_unicode_with_escapes(struct compiling *c, const char *s, size_t len)
+decode_unicode_with_escapes(struct compiling *c, const node *n, const char *s,
                            size_t len)
 {
    PyObject *v, *u;
    char *buf;
@ -4167,11 +4193,41 @@ decode_unicode_with_escapes(struct compiling *c, const char *s, size_t len)
    len = p - buf;
    s = buf;
-    v = PyUnicode_DecodeUnicodeEscape(s, len, NULL);
+    const char *first_invalid_escape;
    v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
    if (v != NULL && first_invalid_escape != NULL) {
        if (warn_invalid_escape_sequence(c, n, *first_invalid_escape) < 0) {
            /* We have not decref u before because first_invalid_escape points
               inside u. */
            Py_XDECREF(u);
            Py_DECREF(v);
            return NULL;
        }
    }
    Py_XDECREF(u);
    return v;
 }
 /* Decode the escapes of a bytes literal (s, len) found at node n.
    Decoding is done by _PyBytes_DecodeEscape(); if it reports an invalid
    escape, warn_invalid_escape_sequence() is called for the first one.
    Returns the decoded object, or NULL if decoding or the warning fails. */
 static PyObject *
 decode_bytes_with_escapes(struct compiling *c, const node *n, const char *s,
                           size_t len)
 {
    const char *bad_escape;
    PyObject *decoded;

    decoded = _PyBytes_DecodeEscape(s, len, NULL, 0, NULL, &bad_escape);
    if (decoded != NULL && bad_escape != NULL) {
        if (warn_invalid_escape_sequence(c, n, *bad_escape) < 0) {
            /* Warning failed or was escalated: discard the result. */
            Py_DECREF(decoded);
            decoded = NULL;
        }
    }
    return decoded;
 }
 /* Compile this expression in to an expr_ty.  Add parens around the
   expression, in order to allow leading spaces in the expression. */
 static expr_ty
@ -4310,7 +4366,7 @@ done:
                                                    literal_end-literal_start,
                                                    NULL, NULL);
        else
-            *literal = decode_unicode_with_escapes(c, literal_start,
+            *literal = decode_unicode_with_escapes(c, n, literal_start,
                                                   literal_end-literal_start);
        if (!*literal)
            return -1;
@ -5048,12 +5104,12 @@ parsestr(struct compiling *c, const node *n, int *bytesmode, int *rawmode,
        if (*rawmode)
            *result = PyBytes_FromStringAndSize(s, len);
        else
-            *result = PyBytes_DecodeEscape(s, len, NULL, /* ignored */ 0, NULL);
+            *result = decode_bytes_with_escapes(c, n, s, len);
    } else {
        if (*rawmode)
            *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
        else
-            *result = decode_unicode_with_escapes(c, s, len);
+            *result = decode_unicode_with_escapes(c, n, s, len);
    }
    return *result == NULL ? -1 : 0;
 }