Issue 28128: Print out better error/warning messages for invalid string escapes. Backport to 3.6.

This commit is contained in:
Eric V. Smith 2016-10-31 14:46:26 -04:00
parent 7f0514ad54
commit 5646648678
8 changed files with 173 additions and 22 deletions

View file

@ -74,6 +74,11 @@ PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t, PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t,
const char *, Py_ssize_t, const char *, Py_ssize_t,
const char *); const char *);
/* Helper for PyBytes_DecodeEscape that detects invalid escape chars.
   The final argument is an output parameter: it is reset to NULL before
   decoding starts and, if an unrecognized escape is met, is set to point
   at the character following the backslash of the first such escape
   inside the input buffer.  PyBytes_DecodeEscape uses that pointer to
   issue the DeprecationWarning itself. */
PyAPI_FUNC(PyObject *) _PyBytes_DecodeEscape(const char *, Py_ssize_t,
const char *, Py_ssize_t,
const char *,
const char **);
/* Macro, trading safety for speed */ /* Macro, trading safety for speed */
#ifndef Py_LIMITED_API #ifndef Py_LIMITED_API

View file

@ -1486,6 +1486,17 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
const char *errors /* error handling */ const char *errors /* error handling */
); );
/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
chars.  The pointer is only recorded here; PyUnicode_DecodeUnicodeEscape
turns it into a DeprecationWarning. */
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape(
const char *string, /* Unicode-Escape encoded string */
Py_ssize_t length, /* size of string */
const char *errors, /* error handling */
const char **first_invalid_escape /* on return, points to first
invalid escaped char in
string, or is NULL if no
invalid escape was seen. */
);
PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
PyObject *unicode /* Unicode object */ PyObject *unicode /* Unicode object */
); );

View file

@ -31,6 +31,7 @@ import os
import sys import sys
import shutil import shutil
import tempfile import tempfile
import warnings
import unittest import unittest
@ -104,6 +105,19 @@ class TestLiterals(unittest.TestCase):
self.assertRaises(SyntaxError, eval, r""" '\U000000' """) self.assertRaises(SyntaxError, eval, r""" '\U000000' """)
self.assertRaises(SyntaxError, eval, r""" '\U0000000' """) self.assertRaises(SyntaxError, eval, r""" '\U0000000' """)
def test_eval_str_invalid_escape(self):
# Invalid escapes in str literals: for each ASCII code 1..127 that is not
# in the skip list, evaluating '\<char>' must raise a DeprecationWarning
# and yield a backslash followed by that character.
for b in range(1, 128):
# Characters excluded from the check (presumably those that begin a
# valid escape sequence or a line continuation).
if b in b"""\n\r"'01234567NU\\abfnrtuvx""":
continue
with self.assertWarns(DeprecationWarning):
self.assertEqual(eval(r"'\%c'" % b), '\\' + chr(b))
# Exactly one warning is expected for a two-line literal whose second
# line holds the invalid escape \z; its filename and line number are
# checked.
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter('always', category=DeprecationWarning)
eval("'''\n\\z'''")
self.assertEqual(len(w), 1)
self.assertEqual(w[0].filename, '<string>')
self.assertEqual(w[0].lineno, 2)
def test_eval_str_raw(self): def test_eval_str_raw(self):
self.assertEqual(eval(""" r'x' """), 'x') self.assertEqual(eval(""" r'x' """), 'x')
self.assertEqual(eval(r""" r'\x01' """), '\\' + 'x01') self.assertEqual(eval(r""" r'\x01' """), '\\' + 'x01')
@ -130,6 +144,19 @@ class TestLiterals(unittest.TestCase):
self.assertRaises(SyntaxError, eval, r""" b'\x' """) self.assertRaises(SyntaxError, eval, r""" b'\x' """)
self.assertRaises(SyntaxError, eval, r""" b'\x0' """) self.assertRaises(SyntaxError, eval, r""" b'\x0' """)
def test_eval_bytes_invalid_escape(self):
# Invalid escapes in bytes literals: for each ASCII code 1..127 that is
# not in the skip list, evaluating b'\<char>' must raise a
# DeprecationWarning and yield a backslash byte followed by that byte.
for b in range(1, 128):
# Same skip list as the str test minus N, U and u, so those three are
# exercised here as invalid escapes.
if b in b"""\n\r"'01234567\\abfnrtvx""":
continue
with self.assertWarns(DeprecationWarning):
self.assertEqual(eval(r"b'\%c'" % b), b'\\' + bytes([b]))
# Exactly one warning is expected for a two-line bytes literal whose
# second line holds the invalid escape \z; its filename and line number
# are checked.
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter('always', category=DeprecationWarning)
eval("b'''\n\\z'''")
self.assertEqual(len(w), 1)
self.assertEqual(w[0].filename, '<string>')
self.assertEqual(w[0].lineno, 2)
def test_eval_bytes_raw(self): def test_eval_bytes_raw(self):
self.assertEqual(eval(""" br'x' """), b'x') self.assertEqual(eval(""" br'x' """), b'x')
self.assertEqual(eval(""" rb'x' """), b'x') self.assertEqual(eval(""" rb'x' """), b'x')

View file

@ -2413,13 +2413,6 @@ class UnicodeTest(string_tests.CommonTest,
support.check_free_after_iterating(self, iter, str) support.check_free_after_iterating(self, iter, str)
support.check_free_after_iterating(self, reversed, str) support.check_free_after_iterating(self, reversed, str)
def test_invalid_sequences(self):
# For every ASCII letter and the digits 8 and 9, except the letters in
# the skip string below, evaluating '\<char>' must raise a
# DeprecationWarning.
for letter in string.ascii_letters + "89": # 0-7 are octal escapes
if letter in "abfnrtuvxNU":
continue
with self.assertWarns(DeprecationWarning):
eval(r"'\%s'" % letter)
class CAPITest(unittest.TestCase): class CAPITest(unittest.TestCase):

View file

@ -10,6 +10,10 @@ What's New in Python 3.6.0 beta 3
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #28128: Deprecation warning for invalid str and bytes escape
sequences now reports the file name and line number of the literal
that contains them. Patch by Serhiy Storchaka and Eric Smith.
- Issue #28509: dict.update() no longer allocate unnecessary large memory. - Issue #28509: dict.update() no longer allocate unnecessary large memory.
- Issue #28426: Fixed potential crash in PyUnicode_AsDecodedObject() in debug - Issue #28426: Fixed potential crash in PyUnicode_AsDecodedObject() in debug

View file

@ -1105,11 +1105,12 @@ _PyBytes_DecodeEscapeRecode(const char **s, const char *end,
return p; return p;
} }
PyObject *PyBytes_DecodeEscape(const char *s, PyObject *_PyBytes_DecodeEscape(const char *s,
Py_ssize_t len, Py_ssize_t len,
const char *errors, const char *errors,
Py_ssize_t unicode, Py_ssize_t unicode,
const char *recode_encoding) const char *recode_encoding,
const char **first_invalid_escape)
{ {
int c; int c;
char *p; char *p;
@ -1123,6 +1124,8 @@ PyObject *PyBytes_DecodeEscape(const char *s,
return NULL; return NULL;
writer.overallocate = 1; writer.overallocate = 1;
*first_invalid_escape = NULL;
end = s + len; end = s + len;
while (s < end) { while (s < end) {
if (*s != '\\') { if (*s != '\\') {
@ -1207,9 +1210,12 @@ PyObject *PyBytes_DecodeEscape(const char *s,
break; break;
default: default:
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, "invalid escape sequence '\\%c'", *(--s)) < 0) if (*first_invalid_escape == NULL) {
goto failed; *first_invalid_escape = s-1; /* Back up one char, since we've
already incremented s. */
}
*p++ = '\\'; *p++ = '\\';
s--;
goto non_esc; /* an arbitrary number of unescaped goto non_esc; /* an arbitrary number of unescaped
UTF-8 bytes may follow. */ UTF-8 bytes may follow. */
} }
@ -1222,6 +1228,29 @@ PyObject *PyBytes_DecodeEscape(const char *s,
return NULL; return NULL;
} }
/* Decode the escape sequences in the len bytes starting at s.

   All arguments are forwarded unchanged to _PyBytes_DecodeEscape().  If that
   helper reports an unrecognized escape, a DeprecationWarning naming the
   first offending character is issued here.

   Returns a new reference, or NULL with an exception set if decoding fails
   or the warning is raised as an exception. */
PyObject *PyBytes_DecodeEscape(const char *s,
                               Py_ssize_t len,
                               const char *errors,
                               Py_ssize_t unicode,
                               const char *recode_encoding)
{
    const char *first_invalid_escape;
    PyObject *result = _PyBytes_DecodeEscape(s, len, errors, unicode,
                                             recode_encoding,
                                             &first_invalid_escape);
    if (result == NULL) {
        return NULL;
    }
    if (first_invalid_escape != NULL) {
        /* %c expects an int code point.  Go through unsigned char so that a
           byte >= 0x80 is not sign-extended to a negative value on
           platforms where plain char is signed. */
        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
                             "invalid escape sequence '\\%c'",
                             (unsigned char)*first_invalid_escape) < 0) {
            Py_DECREF(result);
            return NULL;
        }
    }
    return result;
}
/* -------------------------------------------------------------------- */ /* -------------------------------------------------------------------- */
/* object api */ /* object api */

View file

@ -5896,9 +5896,10 @@ PyUnicode_AsUTF16String(PyObject *unicode)
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
PyObject * PyObject *
PyUnicode_DecodeUnicodeEscape(const char *s, _PyUnicode_DecodeUnicodeEscape(const char *s,
Py_ssize_t size, Py_ssize_t size,
const char *errors) const char *errors,
const char **first_invalid_escape)
{ {
const char *starts = s; const char *starts = s;
_PyUnicodeWriter writer; _PyUnicodeWriter writer;
@ -5906,6 +5907,9 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
PyObject *errorHandler = NULL; PyObject *errorHandler = NULL;
PyObject *exc = NULL; PyObject *exc = NULL;
// so we can remember if we've seen an invalid escape char or not
*first_invalid_escape = NULL;
if (size == 0) { if (size == 0) {
_Py_RETURN_UNICODE_EMPTY(); _Py_RETURN_UNICODE_EMPTY();
} }
@ -6080,9 +6084,10 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
goto error; goto error;
default: default:
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, if (*first_invalid_escape == NULL) {
"invalid escape sequence '\\%c'", c) < 0) *first_invalid_escape = s-1; /* Back up one char, since we've
goto onError; already incremented s. */
}
WRITE_ASCII_CHAR('\\'); WRITE_ASCII_CHAR('\\');
WRITE_CHAR(c); WRITE_CHAR(c);
continue; continue;
@ -6117,6 +6122,27 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
return NULL; return NULL;
} }
/* Decode size bytes of Unicode-Escape encoded data starting at s.

   The work is done by _PyUnicode_DecodeUnicodeEscape(); if it reports an
   unrecognized escape, a DeprecationWarning naming the first offending
   character is issued here.

   Returns a new reference, or NULL with an exception set if decoding fails
   or the warning is raised as an exception. */
PyObject *
PyUnicode_DecodeUnicodeEscape(const char *s,
                              Py_ssize_t size,
                              const char *errors)
{
    const char *first_invalid_escape;
    PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
                                                      &first_invalid_escape);
    if (result == NULL) {
        return NULL;
    }
    if (first_invalid_escape != NULL) {
        /* %c expects an int code point.  Go through unsigned char so that a
           byte >= 0x80 is not sign-extended to a negative value on
           platforms where plain char is signed. */
        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
                             "invalid escape sequence '\\%c'",
                             (unsigned char)*first_invalid_escape) < 0) {
            Py_DECREF(result);
            return NULL;
        }
    }
    return result;
}
/* Return a Unicode-Escape string version of the Unicode object. /* Return a Unicode-Escape string version of the Unicode object.
If quotes is true, the string is enclosed in u"" or u'' quotes as If quotes is true, the string is enclosed in u"" or u'' quotes as

View file

@ -4113,8 +4113,34 @@ decode_utf8(struct compiling *c, const char **sPtr, const char *end)
return PyUnicode_DecodeUTF8(t, s - t, NULL); return PyUnicode_DecodeUTF8(t, s - t, NULL);
} }
/* Issue a DeprecationWarning for an invalid escape sequence, attributed to
   c->c_filename and the line of node n.

   first_invalid_escape_char is the character that followed the backslash.

   Returns 0 if the warning was delivered (or ignored), -1 with an exception
   set otherwise.  When the warning itself is raised as an exception, it is
   cleared and the same message is handed to ast_error() with the node, so
   the error is reported against the literal's position. */
static int
warn_invalid_escape_sequence(struct compiling *c, const node *n,
                             char first_invalid_escape_char)
{
    /* %c expects an int code point; the cast keeps bytes >= 0x80 from being
       sign-extended to a negative value where plain char is signed. */
    PyObject *msg = PyUnicode_FromFormat(
        "invalid escape sequence \\%c",
        (unsigned char)first_invalid_escape_char);
    if (msg == NULL) {
        return -1;
    }
    if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg,
                                 c->c_filename, LINENO(n),
                                 NULL, NULL) < 0)
    {
        if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
            const char *s;

            /* Drop the pending DeprecationWarning before calling further
               API functions; ast_error() is then given the message. */
            PyErr_Clear();
            s = PyUnicode_AsUTF8(msg);
            if (s != NULL) {
                ast_error(c, n, s);
            }
        }
        /* Any failure of the warning call, not only a DeprecationWarning
           raised as an error, must be reported to the caller; returning 0
           here would leave an exception set behind a "success" result. */
        Py_DECREF(msg);
        return -1;
    }
    Py_DECREF(msg);
    return 0;
}
static PyObject * static PyObject *
decode_unicode_with_escapes(struct compiling *c, const char *s, size_t len) decode_unicode_with_escapes(struct compiling *c, const node *n, const char *s,
size_t len)
{ {
PyObject *v, *u; PyObject *v, *u;
char *buf; char *buf;
@ -4167,11 +4193,41 @@ decode_unicode_with_escapes(struct compiling *c, const char *s, size_t len)
len = p - buf; len = p - buf;
s = buf; s = buf;
v = PyUnicode_DecodeUnicodeEscape(s, len, NULL); const char *first_invalid_escape;
v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
if (v != NULL && first_invalid_escape != NULL) {
if (warn_invalid_escape_sequence(c, n, *first_invalid_escape) < 0) {
/* We have not decref u before because first_invalid_escape points
inside u. */
Py_XDECREF(u);
Py_DECREF(v);
return NULL;
}
}
Py_XDECREF(u); Py_XDECREF(u);
return v; return v;
} }
/* Decode the escapes of a bytes literal through _PyBytes_DecodeEscape() and,
   if an invalid escape was recorded, report it with
   warn_invalid_escape_sequence() for node n.

   Returns a new reference, or NULL if decoding fails or the warning helper
   returns a negative value. */
static PyObject *
decode_bytes_with_escapes(struct compiling *c, const node *n, const char *s,
                          size_t len)
{
    const char *bad_escape;
    PyObject *decoded = _PyBytes_DecodeEscape(s, len, NULL, 0, NULL,
                                              &bad_escape);

    if (decoded != NULL && bad_escape != NULL
        && warn_invalid_escape_sequence(c, n, *bad_escape) < 0)
    {
        /* The warning helper failed: drop the decoded object and
           propagate the failure. */
        Py_DECREF(decoded);
        decoded = NULL;
    }
    return decoded;
}
/* Compile this expression in to an expr_ty. Add parens around the /* Compile this expression in to an expr_ty. Add parens around the
expression, in order to allow leading spaces in the expression. */ expression, in order to allow leading spaces in the expression. */
static expr_ty static expr_ty
@ -4310,7 +4366,7 @@ done:
literal_end-literal_start, literal_end-literal_start,
NULL, NULL); NULL, NULL);
else else
*literal = decode_unicode_with_escapes(c, literal_start, *literal = decode_unicode_with_escapes(c, n, literal_start,
literal_end-literal_start); literal_end-literal_start);
if (!*literal) if (!*literal)
return -1; return -1;
@ -5048,12 +5104,12 @@ parsestr(struct compiling *c, const node *n, int *bytesmode, int *rawmode,
if (*rawmode) if (*rawmode)
*result = PyBytes_FromStringAndSize(s, len); *result = PyBytes_FromStringAndSize(s, len);
else else
*result = PyBytes_DecodeEscape(s, len, NULL, /* ignored */ 0, NULL); *result = decode_bytes_with_escapes(c, n, s, len);
} else { } else {
if (*rawmode) if (*rawmode)
*result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL); *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
else else
*result = decode_unicode_with_escapes(c, s, len); *result = decode_unicode_with_escapes(c, n, s, len);
} }
return *result == NULL ? -1 : 0; return *result == NULL ? -1 : 0;
} }