Mirror of https://github.com/python/cpython.git, synced 2025-09-26 18:29:57 +00:00
normalize latin-1 and utf-8 variant encodings like the builtin tokenizer does
parent ffc08fcad6
commit d3afadaa49
3 changed files with 44 additions and 2 deletions
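In practice, tokenize.detect_encoding() now returns the canonical codec name instead of echoing the coding cookie verbatim. A minimal sketch of the new behaviour (assuming an interpreter that includes this patch; the sample source is made up):

    import io
    from tokenize import detect_encoding

    # A coding cookie spelled as an upper-case/underscore variant of latin-1.
    source = b"# -*- coding: LATIN_1 -*-\nprint('hi')\n"

    encoding, consumed = detect_encoding(io.BytesIO(source).readline)
    print(encoding)  # 'iso-8859-1' with this change; previously the cookie text itself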
Lib/test/test_tokenize.py

@@ -719,7 +719,7 @@ class TestDetectEncoding(TestCase):
             b'do_something(else)\n'
             )
         encoding, consumed_lines = detect_encoding(self.get_readline(lines))
-        self.assertEquals(encoding, 'latin-1')
+        self.assertEquals(encoding, 'iso-8859-1')
         self.assertEquals(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

     def test_matched_bom_and_cookie_first_line(self):

@@ -775,6 +775,34 @@ class TestDetectEncoding(TestCase):
         readline = self.get_readline(lines)
         self.assertRaises(SyntaxError, detect_encoding, readline)

+    def test_latin1_normalization(self):
+        # See get_normal_name() in tokenizer.c.
+        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
+                     "iso-8859-1-unix", "iso-latin-1-mac")
+        for encoding in encodings:
+            for rep in ("-", "_"):
+                enc = encoding.replace("-", rep)
+                lines = (b"#!/usr/bin/python\n",
+                         b"# coding: " + enc.encode("ascii") + b"\n",
+                         b"print(things)\n",
+                         b"do_something += 4\n")
+                rl = self.get_readline(lines)
+                found, consumed_lines = detect_encoding(rl)
+                self.assertEquals(found, "iso-8859-1")
+
+    def test_utf8_normalization(self):
+        # See get_normal_name() in tokenizer.c.
+        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
+        for encoding in encodings:
+            for rep in ("-", "_"):
+                enc = encoding.replace("-", rep)
+                lines = (b"#!/usr/bin/python\n",
+                         b"# coding: " + enc.encode("ascii") + b"\n",
+                         b"1 + 3\n")
+                rl = self.get_readline(lines)
+                found, consumed_lines = detect_encoding(rl)
+                self.assertEquals(found, "utf-8")
+
     def test_short_files(self):
         readline = self.get_readline((b'print(something)\n',))
         encoding, consumed_lines = detect_encoding(readline)
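One way to exercise the new tests from a built CPython checkout is the stock regression-test runner (a usage note, not part of this patch):

    ./python -m test.regrtest -v test_tokenize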
Lib/tokenize.py

@@ -279,6 +279,17 @@ def untokenize(iterable):
     return out


+def _get_normal_name(orig_enc):
+    """Imitates get_normal_name in tokenizer.c."""
+    # Only care about the first 12 characters.
+    enc = orig_enc[:12].lower().replace("_", "-")
+    if enc == "utf-8" or enc.startswith("utf-8-"):
+        return "utf-8"
+    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
+       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
+        return "iso-8859-1"
+    return orig_enc
+
 def detect_encoding(readline):
     """
     The detect_encoding() function is used to detect the encoding that should

@@ -313,7 +324,7 @@ def detect_encoding(readline):
         matches = cookie_re.findall(line_string)
         if not matches:
             return None
-        encoding = matches[0]
+        encoding = _get_normal_name(matches[0])
         try:
             codec = lookup(encoding)
         except LookupError:
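For reference, the normalization the new private helper performs, with return values read directly off the code above (illustrative calls only; _get_normal_name is not a public API):

    _get_normal_name("UTF_8_MAC")     # -> "utf-8"
    _get_normal_name("Latin-1-unix")  # -> "iso-8859-1"
    _get_normal_name("euc-jp")        # -> "euc-jp"  (unrecognized spellings pass through unchanged)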
Misc/NEWS

@@ -87,6 +87,9 @@ C-API
 Library
 -------

+- Make tokenize.detect_encoding() normalize utf-8 and iso-8859-1 variants like
+  the builtin tokenizer.
+
 - Issue #7048: Force Decimal.logb to round its result when that result
   is too large to fit in the current precision.