Mirror of https://github.com/python/cpython.git (synced 2025-09-26 18:29:57 +00:00)
Issue #18960: Fix bugs with Python source code encoding in the second line.
* The first line of a Python script could be executed twice when the source
  encoding (not equal to 'utf-8') was specified on the second line.
* The source encoding declaration on the second line is no longer effective if
  the first line contains anything except a comment.
* As a consequence, 'python -x' works again with files that carry the source
  encoding declaration on the second line, and can again be used to make
  Python batch files on Windows.
* The tokenize module now ignores the source encoding declaration on the
  second line if the first line contains anything except a comment.
* IDLE now ignores the source encoding declaration on the second line if the
  first line contains anything except a comment.
* 2to3 and the findnocoding.py script now ignore the source encoding
  declaration on the second line if the first line contains anything except
  a comment.

parent 21e7d4cd5e
commit 768c16ce02
7 changed files with 87 additions and 5 deletions
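
To illustrate the behaviour the commit describes, here is a minimal sketch (not
part of the commit) that feeds detect_encoding() the same kind of byte strings
the new tests use; on an interpreter with this fix, a cookie on the second line
is honoured only when the first line is blank or a comment:

    import io
    from tokenize import detect_encoding

    def sniff(*lines):
        return detect_encoding(io.BytesIO(b"".join(lines)).readline)[0]

    # First line is code: the second-line cookie is ignored -> 'utf-8'
    print(sniff(b"print('hi')\n", b"# -*- coding: iso8859-15 -*-\n"))
    # First line is only a comment: the cookie still applies -> 'iso8859-15'
    print(sniff(b"# header comment\n", b"# -*- coding: iso8859-15 -*-\n"))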

Lib/idlelib/IOBinding.py

@@ -64,6 +64,7 @@ encoding = locale_encoding  ### KBK 07Sep07  This is used all over IDLE, check!
                             ### 'encoding' is used below in encode(), check!
 
 coding_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
+blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 def coding_spec(data):
     """Return the encoding declaration according to PEP 263.
@@ -93,6 +94,8 @@ def coding_spec(data):
         match = coding_re.match(line)
         if match is not None:
             break
+        if not blank_re.match(line):
+            return None
     else:
         return None
     name = match.group(1)
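
The behaviour hinges on the new blank_re pattern: a line counts as "blank or a
comment" if it contains only whitespace or if its first non-whitespace
character is '#' (so a shebang line still qualifies). A standalone check of the
pattern, for illustration only:

    import re

    blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

    for line in ("", "   \n", "# a comment\n", "#!/usr/bin/env python3\n",
                 "import os\n", "x = 1  # trailing comment\n"):
        print(repr(line), bool(blank_re.match(line)))
    # Only the first four lines match; a line that starts with code does not,
    # so coding_spec() will ignore a cookie on the line after it.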

Lib/lib2to3/pgen2/tokenize.py

@@ -237,6 +237,7 @@ class Untokenizer:
             toks_append(tokval)
 
 cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
+blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 def _get_normal_name(orig_enc):
     """Imitates get_normal_name in tokenizer.c."""
@@ -309,6 +310,8 @@ def detect_encoding(readline):
     encoding = find_cookie(first)
     if encoding:
         return encoding, [first]
+    if not blank_re.match(first):
+        return default, [first]
 
     second = read_or_stop()
     if not second:
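
The same early return appears in Lib/tokenize.py below. Note that a shebang
first line still counts as a comment, so a cookie on line two keeps working; a
usage sketch against the standard tokenize module (the lib2to3 copy mirrors
it):

    import io
    from tokenize import detect_encoding

    src = (b"#!/usr/bin/env python3\n"
           b"# -*- coding: iso8859-15 -*-\n"
           b"print('ok')\n")
    encoding, consumed = detect_encoding(io.BytesIO(src).readline)
    print(encoding)       # 'iso8859-15': the shebang is treated as a comment
    print(len(consumed))  # 2: both header lines were consumed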

Lib/test/test_tokenize.py

@@ -885,6 +885,39 @@ class TestDetectEncoding(TestCase):
         readline = self.get_readline(lines)
         self.assertRaises(SyntaxError, detect_encoding, readline)
 
+    def test_cookie_second_line_noncommented_first_line(self):
+        lines = (
+            b"print('\xc2\xa3')\n",
+            b'# vim: set fileencoding=iso8859-15 :\n',
+            b"print('\xe2\x82\xac')\n"
+        )
+        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'utf-8')
+        expected = [b"print('\xc2\xa3')\n"]
+        self.assertEqual(consumed_lines, expected)
+
+    def test_cookie_second_line_commented_first_line(self):
+        lines = (
+            b"#print('\xc2\xa3')\n",
+            b'# vim: set fileencoding=iso8859-15 :\n',
+            b"print('\xe2\x82\xac')\n"
+        )
+        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso8859-15')
+        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
+        self.assertEqual(consumed_lines, expected)
+
+    def test_cookie_second_line_empty_first_line(self):
+        lines = (
+            b'\n',
+            b'# vim: set fileencoding=iso8859-15 :\n',
+            b"print('\xe2\x82\xac')\n"
+        )
+        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso8859-15')
+        expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
+        self.assertEqual(consumed_lines, expected)
+
     def test_latin1_normalization(self):
         # See get_normal_name() in tokenizer.c.
         encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
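
To run just these tests from a built checkout, one option is to load the test
class directly (a sketch, assuming the module is importable as
test.test_tokenize):

    import unittest

    # Load only the encoding-detection tests that this commit extends.
    suite = unittest.defaultTestLoader.loadTestsFromName(
        "test.test_tokenize.TestDetectEncoding")
    unittest.TextTestRunner(verbosity=2).run(suite)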

Lib/tokenize.py

@@ -32,6 +32,7 @@ from codecs import lookup, BOM_UTF8
 import collections
 from io import TextIOWrapper
 cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
+blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 import token
 __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
@@ -409,6 +410,8 @@ def detect_encoding(readline):
     encoding = find_cookie(first)
     if encoding:
         return encoding, [first]
+    if not blank_re.match(first):
+        return default, [first]
 
     second = read_or_stop()
     if not second:
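
Because tokenize.tokenize() calls detect_encoding() first, the change is also
visible in the ENCODING token it emits; a small sketch:

    import io
    import tokenize

    src = (b"import sys\n"
           b"# -*- coding: iso8859-15 -*-\n"
           b"print(sys.version)\n")
    tokens = list(tokenize.tokenize(io.BytesIO(src).readline))
    # The first token reports the encoding actually used for decoding; with a
    # non-comment first line the stale cookie on line two is ignored.
    print(tokens[0].type == tokenize.ENCODING, tokens[0].string)  # True utf-8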

Misc/NEWS

@@ -10,6 +10,13 @@ What's New in Python 3.3.4 release candidate 1?
 Core and Builtins
 -----------------
 
+- Issue #18960: The first line of Python script could be executed twice when
+  the source encoding was specified on the second line.  Now the source encoding
+  declaration on the second line isn't effective if the first line contains
+  anything except a comment.  'python -x' works now again with files with the
+  source encoding declarations, and can be used to make Python batch files
+  on Windows.
+
 - Issue #19081: When a zipimport .zip file in sys.path being imported from
   is modified during the lifetime of the Python process after zipimport has
   already cached the zip's table of contents we detect this and recover
@@ -36,6 +43,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #18960: The tokenize module now ignore the source encoding declaration
+  on the second line if the first line contains anything except a comment.
+
 - Issue #20078: Reading malformed zipfiles no longer hangs with 100% CPU
   consumption.
 
@@ -204,6 +214,9 @@ Library
 IDLE
 ----
 
+- Issue #18960: IDLE now ignores the source encoding declaration on the second
+  line if the first line contains anything except a comment.
+
 - Issue #20058: sys.stdin.readline() in IDLE now always returns only one line.
 
 - Issue #19481: print() of string subclass instance in IDLE no longer hangs.
@@ -281,6 +294,13 @@ Build
 - Add workaround for VS 2010 nmake clean issue. VS 2010 doesn't set up PATH
   for nmake.exe correctly.
 
+Tools/Demos
+-----------
+
+- Issue #18960: 2to3 and the findnocoding.py script now ignore the source
+  encoding declaration on the second line if the first line contains anything
+  except a comment.
+
 
 What's New in Python 3.3.3?
 ===========================
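
The 'python -x' scenario from the Core and Builtins entry can be reproduced
with a short driver; this is an illustrative sketch only (the batch-style
header line and temporary file layout are assumptions, not part of the
commit). With the fix, the interpreter skips the first line and honours the
cookie that then leads the file:

    import os
    import subprocess
    import sys
    import tempfile

    # Line 1 is a batch-style header (never seen by the tokenizer under -x),
    # line 2 carries the encoding declaration, line 3 is ordinary code.
    payload = (b"@python -x %~f0 %* & exit /b\r\n"
               b"# -*- coding: iso8859-15 -*-\r\n"
               b"print('decoded with the declared encoding')\r\n")

    with tempfile.NamedTemporaryFile(suffix=".py", delete=False) as f:
        f.write(payload)
        path = f.name
    try:
        # Previously the skipped first line could be read again while switching
        # to the declared encoding; now this runs cleanly.
        subprocess.check_call([sys.executable, "-x", path])
    finally:
        os.unlink(path)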

Parser/tokenizer.c

@@ -283,13 +283,27 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
     char *cs;
     int r = 1;
 
-    if (tok->cont_line)
+    if (tok->cont_line) {
         /* It's a continuation line, so it can't be a coding spec. */
+        tok->read_coding_spec = 1;
         return 1;
+    }
     if (!get_coding_spec(line, &cs, size, tok))
         return 0;
-    if (!cs)
+    if (!cs) {
+        Py_ssize_t i;
+        for (i = 0; i < size; i++) {
+            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
+                break;
+            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
+                /* Stop checking coding spec after a line containing
+                 * anything except a comment. */
+                tok->read_coding_spec = 1;
+                break;
+            }
+        }
         return 1;
+    }
     tok->read_coding_spec = 1;
     if (tok->encoding == NULL) {
         assert(tok->decoding_state == STATE_RAW);
@@ -476,13 +490,17 @@ fp_setreadl(struct tok_state *tok, const char* enc)
     _Py_IDENTIFIER(open);
     _Py_IDENTIFIER(readline);
     int fd;
+    long pos;
 
     io = PyImport_ImportModuleNoBlock("io");
     if (io == NULL)
         goto cleanup;
 
     fd = fileno(tok->fp);
-    if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
+    /* Due to buffering the file offset for fd can be different from the file
+     * position of tok->fp. */
+    pos = ftell(tok->fp);
+    if (pos == -1 || lseek(fd, (off_t)pos, SEEK_SET) == (off_t)-1) {
         PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
         goto cleanup;
     }
@@ -751,7 +769,7 @@ decode_str(const char *input, int single, struct tok_state *tok)
     if (newl[0]) {
         if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
             return error_ret(tok);
-        if (tok->enc == NULL && newl[1]) {
+        if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
             if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                    tok, buf_setreadl))
                 return error_ret(tok);
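
For readers who do not want to trace the C, the new loop in check_coding_spec()
amounts to the following rule: once a cookie-less line contains anything other
than whitespace before a '#', a CR/LF, or the end of the line, the tokenizer
stops looking for a coding spec on later lines. An illustrative Python
transcription (not code from the commit):

    def stops_coding_spec_search(line: bytes) -> bool:
        """Mirror of the C loop: True if this cookie-less line ends the search."""
        for ch in line:
            if ch in b'#\r\n':        # comment or end of line: keep looking
                return False
            if ch not in b' \t\x0c':  # anything else is real code
                return True
        return False                  # only whitespace: effectively blank

    print(stops_coding_spec_search(b"   # just a comment\n"))  # False
    print(stops_coding_spec_search(b"print('hi')\n"))          # True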

Tools/scripts/findnocoding.py

@@ -33,6 +33,7 @@ except ImportError:
 
 
 decl_re = re.compile(rb'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)')
+blank_re = re.compile(rb'^[ \t\f]*(?:[#\r\n]|$)')
 
 def get_declaration(line):
     match = decl_re.match(line)
@@ -58,7 +59,8 @@ def needs_declaration(fullpath):
         line1 = infile.readline()
         line2 = infile.readline()
 
-        if get_declaration(line1) or get_declaration(line2):
+        if (get_declaration(line1) or
+                blank_re.match(line1) and get_declaration(line2)):
             # the file does have an encoding declaration, so trust it
             return False
 
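
One detail worth noting in the new condition: 'and' binds tighter than 'or', so
the script only consults line 2 when line 1 is blank or a comment. A fully
parenthesized, self-contained equivalent (using the regexes directly in place
of get_declaration(), for illustration):

    import re

    decl_re = re.compile(rb'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)')
    blank_re = re.compile(rb'^[ \t\f]*(?:[#\r\n]|$)')

    def has_trusted_declaration(line1: bytes, line2: bytes) -> bool:
        """Equivalent, explicitly parenthesized form of the fixed check."""
        return bool(decl_re.match(line1) or
                    (blank_re.match(line1) and decl_re.match(line2)))

    print(has_trusted_declaration(b"#!/usr/bin/env python\n",
                                  b"# -*- coding: latin-1 -*-\n"))  # True
    print(has_trusted_declaration(b"import os\n",
                                  b"# -*- coding: latin-1 -*-\n"))  # False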