bpo-41158: IDLE: rewrite the code for handling file encoding (GH-21215)

(cherry picked from commit 694d31e714)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
Miss Islington (bot) 2020-07-01 08:29:30 -07:00 committed by GitHub
parent c20e83dce0
commit fe0175f5b5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -1,10 +1,8 @@
import codecs
from codecs import BOM_UTF8
import os import os
import re
import shlex import shlex
import sys import sys
import tempfile import tempfile
import tokenize
import tkinter.filedialog as tkFileDialog import tkinter.filedialog as tkFileDialog
import tkinter.messagebox as tkMessageBox import tkinter.messagebox as tkMessageBox
@ -20,49 +18,6 @@ else:
errors = 'surrogateescape' errors = 'surrogateescape'
# PEP 263 declaration: a comment line containing "coding:" or "coding="
# followed by the encoding name (captured in group 1).
coding_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
# A line that is empty, whitespace-only, or starts with a comment.
blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)


def coding_spec(data):
    """Return the encoding declaration according to PEP 263.

    data may be bytes or str.  Only the first two lines are examined.
    When checking encoded data, only the first two lines should be passed
    in to avoid a UnicodeDecodeError if the rest of the data is not unicode.
    The first two lines would contain the encoding specification.

    Return the declared encoding name, or None when neither of the first
    two lines declares one (a non-blank, non-comment first line stops the
    search immediately).

    Raise a LookupError if the encoding is declared but unknown.
    """
    if isinstance(data, bytes):
        # This encoding might be wrong. However, the coding
        # spec must be ASCII-only, so any non-ASCII characters
        # around here will be ignored. Decoding to Latin-1 should
        # never fail (except for memory outage)
        lines = data.decode('iso-8859-1')
    else:
        lines = data

    # Consider only the first two lines; '\n' takes precedence over a
    # bare '\r' as the separator.
    if '\n' in lines:
        lst = lines.split('\n', 2)[:2]
    elif '\r' in lines:
        lst = lines.split('\r', 2)[:2]
    else:
        lst = [lines]

    for line in lst:
        match = coding_re.match(line)
        if match is not None:
            break
        if not blank_re.match(line):
            # Real code before any declaration: there is no coding spec.
            return None
    else:
        # Both candidate lines were blank or plain comments.
        return None

    name = match.group(1)
    try:
        codecs.lookup(name)
    except LookupError:
        # The standard encoding error does not indicate the encoding
        raise LookupError("Unknown encoding: "+name)
    return name
class IOBinding: class IOBinding:
# One instance per editor Window so methods know which to save, close. # One instance per editor Window so methods know which to save, close.
@ -78,7 +33,7 @@ class IOBinding:
self.save_as) self.save_as)
self.__id_savecopy = self.text.bind("<<save-copy-of-window-as-file>>", self.__id_savecopy = self.text.bind("<<save-copy-of-window-as-file>>",
self.save_a_copy) self.save_a_copy)
self.fileencoding = None self.fileencoding = 'utf-8'
self.__id_print = self.text.bind("<<print-window>>", self.print_window) self.__id_print = self.text.bind("<<print-window>>", self.print_window)
def close(self): def close(self):
@ -165,34 +120,44 @@ class IOBinding:
self.text.focus_set() self.text.focus_set()
return "break" return "break"
eol = r"(\r\n)|\n|\r" # \r\n (Windows), \n (UNIX), or \r (Mac)
eol_re = re.compile(eol)
eol_convention = os.linesep # default eol_convention = os.linesep # default
def loadfile(self, filename): def loadfile(self, filename):
try: try:
# open the file in binary mode so that we can handle try:
# end-of-line convention ourselves. with tokenize.open(filename) as f:
with open(filename, 'rb') as f: chars = f.read()
two_lines = f.readline() + f.readline() fileencoding = f.encoding
f.seek(0) eol_convention = f.newlines
bytes = f.read() converted = False
except OSError as msg: except (UnicodeDecodeError, SyntaxError):
tkMessageBox.showerror("I/O Error", str(msg), parent=self.text) # Wait for the editor window to appear
self.editwin.text.update()
enc = askstring(
"Specify file encoding",
"The file's encoding is invalid for Python 3.x.\n"
"IDLE will convert it to UTF-8.\n"
"What is the current encoding of the file?",
initialvalue='utf-8',
parent=self.editwin.text)
with open(filename, encoding=enc) as f:
chars = f.read()
fileencoding = f.encoding
eol_convention = f.newlines
converted = True
except OSError as err:
tkMessageBox.showerror("I/O Error", str(err), parent=self.text)
return False return False
chars, converted = self._decode(two_lines, bytes) except UnicodeDecodeError:
if chars is None:
tkMessageBox.showerror("Decoding Error", tkMessageBox.showerror("Decoding Error",
"File %s\nFailed to Decode" % filename, "File %s\nFailed to Decode" % filename,
parent=self.text) parent=self.text)
return False return False
# We now convert all end-of-lines to '\n's
firsteol = self.eol_re.search(chars)
if firsteol:
self.eol_convention = firsteol.group(0)
chars = self.eol_re.sub(r"\n", chars)
self.text.delete("1.0", "end") self.text.delete("1.0", "end")
self.set_filename(None) self.set_filename(None)
self.fileencoding = fileencoding
self.eol_convention = eol_convention
self.text.insert("1.0", chars) self.text.insert("1.0", chars)
self.reset_undo() self.reset_undo()
self.set_filename(filename) self.set_filename(filename)
@ -205,74 +170,6 @@ class IOBinding:
self.updaterecentfileslist(filename) self.updaterecentfileslist(filename)
return True return True
def _decode(self, two_lines, bytes):
    """Create a Unicode string from the raw contents of a file.

    two_lines holds the first two lines of the file (passed to
    coding_spec to find an encoding declaration); bytes is the whole
    file content.

    Return a (chars, converted) pair.  chars is the decoded text, or
    None when decoding failed.  converted is True only when the text
    was decoded with an encoding the user typed into the prompt.

    Encodings are tried in this order: UTF-8 with signature, the
    declared encoding, ASCII, UTF-8, and finally an encoding requested
    from the user.  self.fileencoding records the outcome: 'BOM', the
    declared encoding name, None (ASCII or user-supplied), or 'utf-8'.
    """
    chars = None
    # Check presence of a UTF-8 signature first
    if bytes.startswith(BOM_UTF8):
        try:
            chars = bytes[len(BOM_UTF8):].decode("utf-8")
        except UnicodeDecodeError:
            # has UTF-8 signature, but fails to decode...
            return None, False
        else:
            # Indicates that this file originally had a BOM
            self.fileencoding = 'BOM'
            return chars, False
    # Next look for coding specification
    try:
        enc = coding_spec(two_lines)
    except LookupError as err:
        # The exception text is "Unknown encoding: <name>"; keep only the
        # name so it is not repeated inside the dialog's own sentence.
        unknown = str(err).rpartition(": ")[2]
        tkMessageBox.showerror(
            title="Error loading the file",
            message="The encoding '%s' is not known to this Python "\
            "installation. The file may not display correctly" % unknown,
            parent = self.text)
        enc = None
    except UnicodeDecodeError:
        return None, False
    if enc:
        try:
            chars = str(bytes, enc)
            self.fileencoding = enc
            return chars, False
        except UnicodeDecodeError:
            # Declared encoding does not fit the data; try the fallbacks.
            pass
    # Try ascii:
    try:
        chars = str(bytes, 'ascii')
        self.fileencoding = None
        return chars, False
    except UnicodeDecodeError:
        pass
    # Try utf-8:
    try:
        chars = str(bytes, 'utf-8')
        self.fileencoding = 'utf-8'
        return chars, False
    except UnicodeDecodeError:
        pass
    # Finally, try the locale's encoding. This is deprecated;
    # the user should declare a non-ASCII encoding
    try:
        # Wait for the editor window to appear
        self.editwin.text.update()
        enc = askstring(
            "Specify file encoding",
            "The file's encoding is invalid for Python 3.x.\n"
            "IDLE will convert it to UTF-8.\n"
            "What is the current encoding of the file?",
            initialvalue = encoding,
            parent = self.editwin.text)
        if enc:
            chars = str(bytes, enc)
            self.fileencoding = None
            return chars, True
    except (UnicodeDecodeError, LookupError):
        # The user-supplied encoding is unknown or does not fit the data;
        # fall through to report failure.
        pass
    return None, False  # None on failure
def maybesave(self): def maybesave(self):
if self.get_saved(): if self.get_saved():
return "yes" return "yes"
@ -360,30 +257,22 @@ class IOBinding:
# text to us. Don't try to guess further. # text to us. Don't try to guess further.
return chars return chars
# Preserve a BOM that might have been present on opening # Preserve a BOM that might have been present on opening
if self.fileencoding == 'BOM': if self.fileencoding == 'utf-8-sig':
return BOM_UTF8 + chars.encode("utf-8") return chars.encode('utf-8-sig')
# See whether there is anything non-ASCII in it. # See whether there is anything non-ASCII in it.
# If not, no need to figure out the encoding. # If not, no need to figure out the encoding.
try: try:
return chars.encode('ascii') return chars.encode('ascii')
except UnicodeError: except UnicodeEncodeError:
pass pass
# Check if there is an encoding declared # Check if there is an encoding declared
try: try:
# a string, let coding_spec slice it to the first two lines encoded = chars.encode('ascii', 'replace')
enc = coding_spec(chars) enc, _ = tokenize.detect_encoding(io.BytesIO(encoded).readline)
failed = None
except LookupError as msg:
failed = msg
enc = None
else:
if not enc:
# PEP 3120: default source encoding is UTF-8
enc = 'utf-8'
if enc:
try:
return chars.encode(enc) return chars.encode(enc)
except UnicodeError: except SyntaxError as err:
failed = str(err)
except UnicodeEncodeError:
failed = "Invalid encoding '%s'" % enc failed = "Invalid encoding '%s'" % enc
tkMessageBox.showerror( tkMessageBox.showerror(
"I/O Error", "I/O Error",
@ -391,7 +280,7 @@ class IOBinding:
parent=self.text) parent=self.text)
# Fallback: save as UTF-8, with BOM - ignoring the incorrect # Fallback: save as UTF-8, with BOM - ignoring the incorrect
# declared encoding # declared encoding
return BOM_UTF8 + chars.encode("utf-8") return chars.encode('utf-8-sig')
def print_window(self, event): def print_window(self, event):
confirm = tkMessageBox.askokcancel( confirm = tkMessageBox.askokcancel(