mirror of
https://github.com/python/cpython.git
synced 2025-09-26 18:29:57 +00:00
bpo-41158: IDLE: rewrite the code for handling file encoding (GH-21215)
(cherry picked from commit 694d31e714)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
parent
c20e83dce0
commit
fe0175f5b5
1 changed file with 41 additions and 152 deletions
|
@ -1,10 +1,8 @@
|
||||||
import codecs
|
|
||||||
from codecs import BOM_UTF8
|
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import shlex
|
import shlex
|
||||||
import sys
|
import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import tokenize
|
||||||
|
|
||||||
import tkinter.filedialog as tkFileDialog
|
import tkinter.filedialog as tkFileDialog
|
||||||
import tkinter.messagebox as tkMessageBox
|
import tkinter.messagebox as tkMessageBox
|
||||||
|
@ -20,49 +18,6 @@ else:
|
||||||
errors = 'surrogateescape'
|
errors = 'surrogateescape'
|
||||||
|
|
||||||
|
|
||||||
coding_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
|
|
||||||
blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
|
|
||||||
|
|
||||||
def coding_spec(data):
|
|
||||||
"""Return the encoding declaration according to PEP 263.
|
|
||||||
|
|
||||||
When checking encoded data, only the first two lines should be passed
|
|
||||||
in to avoid a UnicodeDecodeError if the rest of the data is not unicode.
|
|
||||||
The first two lines would contain the encoding specification.
|
|
||||||
|
|
||||||
Raise a LookupError if the encoding is declared but unknown.
|
|
||||||
"""
|
|
||||||
if isinstance(data, bytes):
|
|
||||||
# This encoding might be wrong. However, the coding
|
|
||||||
# spec must be ASCII-only, so any non-ASCII characters
|
|
||||||
# around here will be ignored. Decoding to Latin-1 should
|
|
||||||
# never fail (except for memory outage)
|
|
||||||
lines = data.decode('iso-8859-1')
|
|
||||||
else:
|
|
||||||
lines = data
|
|
||||||
# consider only the first two lines
|
|
||||||
if '\n' in lines:
|
|
||||||
lst = lines.split('\n', 2)[:2]
|
|
||||||
elif '\r' in lines:
|
|
||||||
lst = lines.split('\r', 2)[:2]
|
|
||||||
else:
|
|
||||||
lst = [lines]
|
|
||||||
for line in lst:
|
|
||||||
match = coding_re.match(line)
|
|
||||||
if match is not None:
|
|
||||||
break
|
|
||||||
if not blank_re.match(line):
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
name = match.group(1)
|
|
||||||
try:
|
|
||||||
codecs.lookup(name)
|
|
||||||
except LookupError:
|
|
||||||
# The standard encoding error does not indicate the encoding
|
|
||||||
raise LookupError("Unknown encoding: "+name)
|
|
||||||
return name
|
|
||||||
|
|
||||||
|
|
||||||
class IOBinding:
|
class IOBinding:
|
||||||
# One instance per editor Window so methods know which to save, close.
|
# One instance per editor Window so methods know which to save, close.
|
||||||
|
@ -78,7 +33,7 @@ class IOBinding:
|
||||||
self.save_as)
|
self.save_as)
|
||||||
self.__id_savecopy = self.text.bind("<<save-copy-of-window-as-file>>",
|
self.__id_savecopy = self.text.bind("<<save-copy-of-window-as-file>>",
|
||||||
self.save_a_copy)
|
self.save_a_copy)
|
||||||
self.fileencoding = None
|
self.fileencoding = 'utf-8'
|
||||||
self.__id_print = self.text.bind("<<print-window>>", self.print_window)
|
self.__id_print = self.text.bind("<<print-window>>", self.print_window)
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
|
@ -165,34 +120,44 @@ class IOBinding:
|
||||||
self.text.focus_set()
|
self.text.focus_set()
|
||||||
return "break"
|
return "break"
|
||||||
|
|
||||||
eol = r"(\r\n)|\n|\r" # \r\n (Windows), \n (UNIX), or \r (Mac)
|
|
||||||
eol_re = re.compile(eol)
|
|
||||||
eol_convention = os.linesep # default
|
eol_convention = os.linesep # default
|
||||||
|
|
||||||
def loadfile(self, filename):
|
def loadfile(self, filename):
|
||||||
try:
|
try:
|
||||||
# open the file in binary mode so that we can handle
|
try:
|
||||||
# end-of-line convention ourselves.
|
with tokenize.open(filename) as f:
|
||||||
with open(filename, 'rb') as f:
|
chars = f.read()
|
||||||
two_lines = f.readline() + f.readline()
|
fileencoding = f.encoding
|
||||||
f.seek(0)
|
eol_convention = f.newlines
|
||||||
bytes = f.read()
|
converted = False
|
||||||
except OSError as msg:
|
except (UnicodeDecodeError, SyntaxError):
|
||||||
tkMessageBox.showerror("I/O Error", str(msg), parent=self.text)
|
# Wait for the editor window to appear
|
||||||
|
self.editwin.text.update()
|
||||||
|
enc = askstring(
|
||||||
|
"Specify file encoding",
|
||||||
|
"The file's encoding is invalid for Python 3.x.\n"
|
||||||
|
"IDLE will convert it to UTF-8.\n"
|
||||||
|
"What is the current encoding of the file?",
|
||||||
|
initialvalue='utf-8',
|
||||||
|
parent=self.editwin.text)
|
||||||
|
with open(filename, encoding=enc) as f:
|
||||||
|
chars = f.read()
|
||||||
|
fileencoding = f.encoding
|
||||||
|
eol_convention = f.newlines
|
||||||
|
converted = True
|
||||||
|
except OSError as err:
|
||||||
|
tkMessageBox.showerror("I/O Error", str(err), parent=self.text)
|
||||||
return False
|
return False
|
||||||
chars, converted = self._decode(two_lines, bytes)
|
except UnicodeDecodeError:
|
||||||
if chars is None:
|
|
||||||
tkMessageBox.showerror("Decoding Error",
|
tkMessageBox.showerror("Decoding Error",
|
||||||
"File %s\nFailed to Decode" % filename,
|
"File %s\nFailed to Decode" % filename,
|
||||||
parent=self.text)
|
parent=self.text)
|
||||||
return False
|
return False
|
||||||
# We now convert all end-of-lines to '\n's
|
|
||||||
firsteol = self.eol_re.search(chars)
|
|
||||||
if firsteol:
|
|
||||||
self.eol_convention = firsteol.group(0)
|
|
||||||
chars = self.eol_re.sub(r"\n", chars)
|
|
||||||
self.text.delete("1.0", "end")
|
self.text.delete("1.0", "end")
|
||||||
self.set_filename(None)
|
self.set_filename(None)
|
||||||
|
self.fileencoding = fileencoding
|
||||||
|
self.eol_convention = eol_convention
|
||||||
self.text.insert("1.0", chars)
|
self.text.insert("1.0", chars)
|
||||||
self.reset_undo()
|
self.reset_undo()
|
||||||
self.set_filename(filename)
|
self.set_filename(filename)
|
||||||
|
@ -205,74 +170,6 @@ class IOBinding:
|
||||||
self.updaterecentfileslist(filename)
|
self.updaterecentfileslist(filename)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _decode(self, two_lines, bytes):
|
|
||||||
"Create a Unicode string."
|
|
||||||
chars = None
|
|
||||||
# Check presence of a UTF-8 signature first
|
|
||||||
if bytes.startswith(BOM_UTF8):
|
|
||||||
try:
|
|
||||||
chars = bytes[3:].decode("utf-8")
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
# has UTF-8 signature, but fails to decode...
|
|
||||||
return None, False
|
|
||||||
else:
|
|
||||||
# Indicates that this file originally had a BOM
|
|
||||||
self.fileencoding = 'BOM'
|
|
||||||
return chars, False
|
|
||||||
# Next look for coding specification
|
|
||||||
try:
|
|
||||||
enc = coding_spec(two_lines)
|
|
||||||
except LookupError as name:
|
|
||||||
tkMessageBox.showerror(
|
|
||||||
title="Error loading the file",
|
|
||||||
message="The encoding '%s' is not known to this Python "\
|
|
||||||
"installation. The file may not display correctly" % name,
|
|
||||||
parent = self.text)
|
|
||||||
enc = None
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
return None, False
|
|
||||||
if enc:
|
|
||||||
try:
|
|
||||||
chars = str(bytes, enc)
|
|
||||||
self.fileencoding = enc
|
|
||||||
return chars, False
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
pass
|
|
||||||
# Try ascii:
|
|
||||||
try:
|
|
||||||
chars = str(bytes, 'ascii')
|
|
||||||
self.fileencoding = None
|
|
||||||
return chars, False
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
pass
|
|
||||||
# Try utf-8:
|
|
||||||
try:
|
|
||||||
chars = str(bytes, 'utf-8')
|
|
||||||
self.fileencoding = 'utf-8'
|
|
||||||
return chars, False
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
pass
|
|
||||||
# Finally, try the locale's encoding. This is deprecated;
|
|
||||||
# the user should declare a non-ASCII encoding
|
|
||||||
try:
|
|
||||||
# Wait for the editor window to appear
|
|
||||||
self.editwin.text.update()
|
|
||||||
enc = askstring(
|
|
||||||
"Specify file encoding",
|
|
||||||
"The file's encoding is invalid for Python 3.x.\n"
|
|
||||||
"IDLE will convert it to UTF-8.\n"
|
|
||||||
"What is the current encoding of the file?",
|
|
||||||
initialvalue = encoding,
|
|
||||||
parent = self.editwin.text)
|
|
||||||
|
|
||||||
if enc:
|
|
||||||
chars = str(bytes, enc)
|
|
||||||
self.fileencoding = None
|
|
||||||
return chars, True
|
|
||||||
except (UnicodeDecodeError, LookupError):
|
|
||||||
pass
|
|
||||||
return None, False # None on failure
|
|
||||||
|
|
||||||
def maybesave(self):
|
def maybesave(self):
|
||||||
if self.get_saved():
|
if self.get_saved():
|
||||||
return "yes"
|
return "yes"
|
||||||
|
@ -360,30 +257,22 @@ class IOBinding:
|
||||||
# text to us. Don't try to guess further.
|
# text to us. Don't try to guess further.
|
||||||
return chars
|
return chars
|
||||||
# Preserve a BOM that might have been present on opening
|
# Preserve a BOM that might have been present on opening
|
||||||
if self.fileencoding == 'BOM':
|
if self.fileencoding == 'utf-8-sig':
|
||||||
return BOM_UTF8 + chars.encode("utf-8")
|
return chars.encode('utf-8-sig')
|
||||||
# See whether there is anything non-ASCII in it.
|
# See whether there is anything non-ASCII in it.
|
||||||
# If not, no need to figure out the encoding.
|
# If not, no need to figure out the encoding.
|
||||||
try:
|
try:
|
||||||
return chars.encode('ascii')
|
return chars.encode('ascii')
|
||||||
except UnicodeError:
|
except UnicodeEncodeError:
|
||||||
pass
|
pass
|
||||||
# Check if there is an encoding declared
|
# Check if there is an encoding declared
|
||||||
try:
|
try:
|
||||||
# a string, let coding_spec slice it to the first two lines
|
encoded = chars.encode('ascii', 'replace')
|
||||||
enc = coding_spec(chars)
|
enc, _ = tokenize.detect_encoding(io.BytesIO(encoded).readline)
|
||||||
failed = None
|
|
||||||
except LookupError as msg:
|
|
||||||
failed = msg
|
|
||||||
enc = None
|
|
||||||
else:
|
|
||||||
if not enc:
|
|
||||||
# PEP 3120: default source encoding is UTF-8
|
|
||||||
enc = 'utf-8'
|
|
||||||
if enc:
|
|
||||||
try:
|
|
||||||
return chars.encode(enc)
|
return chars.encode(enc)
|
||||||
except UnicodeError:
|
except SyntaxError as err:
|
||||||
|
failed = str(err)
|
||||||
|
except UnicodeEncodeError:
|
||||||
failed = "Invalid encoding '%s'" % enc
|
failed = "Invalid encoding '%s'" % enc
|
||||||
tkMessageBox.showerror(
|
tkMessageBox.showerror(
|
||||||
"I/O Error",
|
"I/O Error",
|
||||||
|
@ -391,7 +280,7 @@ class IOBinding:
|
||||||
parent=self.text)
|
parent=self.text)
|
||||||
# Fallback: save as UTF-8, with BOM - ignoring the incorrect
|
# Fallback: save as UTF-8, with BOM - ignoring the incorrect
|
||||||
# declared encoding
|
# declared encoding
|
||||||
return BOM_UTF8 + chars.encode("utf-8")
|
return chars.encode('utf-8-sig')
|
||||||
|
|
||||||
def print_window(self, event):
|
def print_window(self, event):
|
||||||
confirm = tkMessageBox.askokcancel(
|
confirm = tkMessageBox.askokcancel(
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue