mirror of
https://github.com/python/cpython.git
synced 2025-10-17 04:08:28 +00:00
Minimal change that disables (AFAICT) the interpolation of \u and \U inside
raw string literals. I added a whole bunch of tests but am still not sure I am testing all paths through the code. I really think the code could be simplified quite a bit.
This commit is contained in:
parent
aec75c33b7
commit
29fd7120e4
2 changed files with 160 additions and 8 deletions
148
Lib/test/test_strlit.py
Normal file
148
Lib/test/test_strlit.py
Normal file
|
@ -0,0 +1,148 @@
|
||||||
|
r"""Test correct treatment of various string literals by the parser.
|
||||||
|
|
||||||
|
There are four types of string literals:
|
||||||
|
|
||||||
|
'abc' -- normal str
|
||||||
|
r'abc' -- raw str
|
||||||
|
b'xyz' -- normal bytes
|
||||||
|
br'xyz' -- raw bytes
|
||||||
|
|
||||||
|
The difference between normal and raw strings is of course that in a
|
||||||
|
raw string, \ escapes (while still used to determine the end of the
|
||||||
|
literal) are not interpreted, so that r'\x00' contains four
|
||||||
|
characters: a backslash, an x, and two zeros; while '\x00' contains a
|
||||||
|
single character (code point zero).
|
||||||
|
|
||||||
|
The tricky thing is what should happen when non-ASCII bytes are used
|
||||||
|
inside literals. For bytes literals, this is considered illegal. But
|
||||||
|
for str literals, those bytes are supposed to be decoded using the
|
||||||
|
encoding declared for the file (UTF-8 by default).
|
||||||
|
|
||||||
|
We have to test this with various file encodings. We also test it with
|
||||||
|
exec()/eval(), which uses a different code path.
|
||||||
|
|
||||||
|
This file is really about correct treatment of encodings and
|
||||||
|
backslashes. It doens't concern itself with issues like single
|
||||||
|
vs. double quotes or singly- vs. triply-quoted strings: that's dealt
|
||||||
|
with elsewhere (I assume).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
|
||||||
|
TEMPLATE = r"""# coding: %s
|
||||||
|
a = 'x'
|
||||||
|
assert ord(a) == 120
|
||||||
|
b = '\x01'
|
||||||
|
assert ord(b) == 1
|
||||||
|
c = r'\x01'
|
||||||
|
assert list(map(ord, c)) == [92, 120, 48, 49]
|
||||||
|
d = '\x81'
|
||||||
|
assert ord(d) == 0x81
|
||||||
|
e = r'\x81'
|
||||||
|
assert list(map(ord, e)) == [92, 120, 56, 49]
|
||||||
|
f = '\u1881'
|
||||||
|
assert ord(f) == 0x1881
|
||||||
|
g = r'\u1881'
|
||||||
|
assert list(map(ord, g)) == [92, 117, 49, 56, 56, 49]
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def byte(i):
    """Return a length-1 bytes object holding the single byte value *i*."""
    return bytes((i,))
|
||||||
|
|
||||||
|
|
||||||
|
class TestLiterals(unittest.TestCase):
    """Exercise str/bytes literal parsing, raw-mode escapes, and encodings."""

    def setUp(self):
        # A fresh temp directory on sys.path so check_encoding() can
        # write a module file and import it by name.
        self.save_path = sys.path[:]
        self.tmpdir = tempfile.mkdtemp()
        sys.path.insert(0, self.tmpdir)

    def tearDown(self):
        sys.path = self.save_path
        shutil.rmtree(self.tmpdir, ignore_errors=True)

    def test_template(self):
        # Check that the template doesn't contain any non-printables
        # except for \n.
        for c in TEMPLATE:
            assert c == '\n' or ' ' <= c <= '~', repr(c)

    def test_eval_str_normal(self):
        # In normal str literals, \x and \u escapes are interpreted.
        self.assertEqual(eval(""" 'x' """), 'x')
        self.assertEqual(eval(r""" '\x01' """), chr(1))
        self.assertEqual(eval(""" '\x01' """), chr(1))
        self.assertEqual(eval(r""" '\x81' """), chr(0x81))
        self.assertEqual(eval(""" '\x81' """), chr(0x81))
        self.assertEqual(eval(r""" '\u1881' """), chr(0x1881))
        self.assertEqual(eval(""" '\u1881' """), chr(0x1881))

    def test_eval_str_raw(self):
        # In raw str literals, \x and \u stay literal backslash sequences.
        self.assertEqual(eval(""" r'x' """), 'x')
        self.assertEqual(eval(r""" r'\x01' """), '\\' + 'x01')
        self.assertEqual(eval(""" r'\x01' """), chr(1))
        self.assertEqual(eval(r""" r'\x81' """), '\\' + 'x81')
        self.assertEqual(eval(""" r'\x81' """), chr(0x81))
        self.assertEqual(eval(r""" r'\u1881' """), '\\' + 'u1881')
        self.assertEqual(eval(""" r'\u1881' """), chr(0x1881))

    def test_eval_bytes_normal(self):
        # \x escapes work in bytes; non-ASCII source bytes and \u are illegal.
        self.assertEqual(eval(""" b'x' """), b'x')
        self.assertEqual(eval(r""" b'\x01' """), byte(1))
        self.assertEqual(eval(""" b'\x01' """), byte(1))
        self.assertEqual(eval(r""" b'\x81' """), byte(0x81))
        self.assertRaises(SyntaxError, eval, """ b'\x81' """)
        self.assertEqual(eval(r""" b'\u1881' """), b'\\' + b'u1881')
        self.assertRaises(SyntaxError, eval, """ b'\u1881' """)

    def test_eval_bytes_raw(self):
        # Raw bytes: escapes stay literal; non-ASCII source bytes still illegal.
        self.assertEqual(eval(""" br'x' """), b'x')
        self.assertEqual(eval(r""" br'\x01' """), b'\\' + b'x01')
        self.assertEqual(eval(""" br'\x01' """), byte(1))
        self.assertEqual(eval(r""" br'\x81' """), b"\\" + b"x81")
        self.assertRaises(SyntaxError, eval, """ br'\x81' """)
        self.assertEqual(eval(r""" br'\u1881' """), b"\\" + b"u1881")
        self.assertRaises(SyntaxError, eval, """ br'\u1881' """)

    def check_encoding(self, encoding, extra=""):
        """Write TEMPLATE (plus *extra*) as a module in *encoding*, import it.

        The generated module's own asserts do the checking; a bad parse
        raises SyntaxError (or AssertionError) out of __import__.
        """
        modname = "xx_" + encoding.replace("-", "_")
        fn = os.path.join(self.tmpdir, modname + ".py")
        # Context manager replaces the original open/try/finally/close.
        with open(fn, "w", encoding=encoding) as f:
            f.write(TEMPLATE % encoding)
            f.write(extra)
        __import__(modname)
        del sys.modules[modname]

    def test_file_utf_8(self):
        extra = "z = '\u1234'; assert ord(z) == 0x1234\n"
        self.check_encoding("utf-8", extra)

    def test_file_utf_8_error(self):
        # A raw 0x80 byte in a bytes literal must be rejected at parse time.
        extra = "b'\x80'\n"
        self.assertRaises(SyntaxError, self.check_encoding, "utf-8", extra)

    def test_file_utf8(self):
        self.check_encoding("utf8")

    def test_file_iso_8859_1(self):
        self.check_encoding("iso-8859-1")

    def test_file_latin_1(self):
        self.check_encoding("latin-1")

    def test_file_latin9(self):
        self.check_encoding("latin9")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# Hack so that error messages containing non-ASCII can be printed
|
||||||
|
sys.stdout._encoding = sys.stderr._encoding = "utf-8"
|
||||||
|
unittest.main()
|
20
Python/ast.c
20
Python/ast.c
|
@ -1292,7 +1292,7 @@ ast_for_atom(struct compiling *c, const node *n)
|
||||||
case STRING: {
|
case STRING: {
|
||||||
PyObject *str = parsestrplus(c, n, &bytesmode);
|
PyObject *str = parsestrplus(c, n, &bytesmode);
|
||||||
if (!str) {
|
if (!str) {
|
||||||
if (PyErr_ExceptionMatches(PyExc_UnicodeError)){
|
if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
|
||||||
PyObject *type, *value, *tback, *errstr;
|
PyObject *type, *value, *tback, *errstr;
|
||||||
PyErr_Fetch(&type, &value, &tback);
|
PyErr_Fetch(&type, &value, &tback);
|
||||||
errstr = ((PyUnicodeErrorObject *)value)->reason;
|
errstr = ((PyUnicodeErrorObject *)value)->reason;
|
||||||
|
@ -3117,6 +3117,7 @@ decode_unicode(const char *s, size_t len, int rawmode, const char *encoding)
|
||||||
char *buf;
|
char *buf;
|
||||||
char *p;
|
char *p;
|
||||||
const char *end;
|
const char *end;
|
||||||
|
|
||||||
if (encoding == NULL) {
|
if (encoding == NULL) {
|
||||||
buf = (char *)s;
|
buf = (char *)s;
|
||||||
u = NULL;
|
u = NULL;
|
||||||
|
@ -3218,7 +3219,7 @@ parsestr(const node *n, const char *encoding, int *bytesmode)
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!*bytesmode) {
|
if (!*bytesmode && !rawmode) {
|
||||||
return decode_unicode(s, len, rawmode, encoding);
|
return decode_unicode(s, len, rawmode, encoding);
|
||||||
}
|
}
|
||||||
if (*bytesmode) {
|
if (*bytesmode) {
|
||||||
|
@ -3238,13 +3239,17 @@ parsestr(const node *n, const char *encoding, int *bytesmode)
|
||||||
if (rawmode || strchr(s, '\\') == NULL) {
|
if (rawmode || strchr(s, '\\') == NULL) {
|
||||||
if (need_encoding) {
|
if (need_encoding) {
|
||||||
PyObject *v, *u = PyUnicode_DecodeUTF8(s, len, NULL);
|
PyObject *v, *u = PyUnicode_DecodeUTF8(s, len, NULL);
|
||||||
if (u == NULL)
|
if (u == NULL || !*bytesmode)
|
||||||
return NULL;
|
return u;
|
||||||
v = PyUnicode_AsEncodedString(u, encoding, NULL);
|
v = PyUnicode_AsEncodedString(u, encoding, NULL);
|
||||||
Py_DECREF(u);
|
Py_DECREF(u);
|
||||||
return v;
|
return v;
|
||||||
} else {
|
} else if (*bytesmode) {
|
||||||
return PyString_FromStringAndSize(s, len);
|
return PyString_FromStringAndSize(s, len);
|
||||||
|
} else if (strcmp(encoding, "utf-8") == 0) {
|
||||||
|
return PyUnicode_FromStringAndSize(s, len);
|
||||||
|
} else {
|
||||||
|
return PyUnicode_DecodeLatin1(s, len, NULL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3252,7 +3257,7 @@ parsestr(const node *n, const char *encoding, int *bytesmode)
|
||||||
need_encoding ? encoding : NULL);
|
need_encoding ? encoding : NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Build a Python string object out of a STRING atom. This takes care of
|
/* Build a Python string object out of a STRING+ atom. This takes care of
|
||||||
* compile-time literal catenation, calling parsestr() on each piece, and
|
* compile-time literal catenation, calling parsestr() on each piece, and
|
||||||
* pasting the intermediate results together.
|
* pasting the intermediate results together.
|
||||||
*/
|
*/
|
||||||
|
@ -3272,8 +3277,7 @@ parsestrplus(struct compiling *c, const node *n, int *bytesmode)
|
||||||
if (s == NULL)
|
if (s == NULL)
|
||||||
goto onError;
|
goto onError;
|
||||||
if (*bytesmode != subbm) {
|
if (*bytesmode != subbm) {
|
||||||
ast_error(n, "cannot mix bytes and nonbytes"
|
ast_error(n, "cannot mix bytes and nonbytes literals");
|
||||||
"literals");
|
|
||||||
goto onError;
|
goto onError;
|
||||||
}
|
}
|
||||||
if (PyString_Check(v) && PyString_Check(s)) {
|
if (PyString_Check(v) && PyString_Check(s)) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue