bpo-41158: IDLE: rewrite the code for handling file encoding (GH-21215)

(cherry picked from commit 694d31e714) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
2025-09-26 18:29:57 +00:00 · 2020-07-01 08:29:30 -07:00 · 2020-07-01 08:29:30 -07:00 · fe0175f5b5
commit fe0175f5b5
parent c20e83dce0
1 changed file with 41 additions and 152 deletions
--- a/Lib/idlelib/iomenu.py
+++ b/Lib/idlelib/iomenu.py
@ -1,10 +1,8 @@
 import codecs
 from codecs import BOM_UTF8
 import os
 import re
 import shlex
 import sys
 import tempfile
 import tokenize
 import tkinter.filedialog as tkFileDialog
 import tkinter.messagebox as tkMessageBox
@ -20,49 +18,6 @@ else:
    errors = 'surrogateescape'
 coding_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 def coding_spec(data):
    """Find the PEP 263 encoding declaration in *data*, if there is one.

    *data* may be str or bytes.  Only the first two lines are examined.
    When passing encoded data, hand in just those first two lines so that
    undecodable content further down cannot get in the way.

    Return the declared encoding name, or None when no declaration is
    found (including when a line that is neither blank nor a comment
    comes before it).

    Raise LookupError if an encoding is declared but the codecs module
    does not know it.
    """
    # The declaration itself has to be ASCII, so the real encoding does not
    # matter here: Latin-1 maps every byte to a character and therefore
    # cannot fail to decode.
    text = data.decode('iso-8859-1') if isinstance(data, bytes) else data

    # Keep the first two lines only, splitting on '\n' when present and
    # on a bare '\r' otherwise.
    for separator in ('\n', '\r'):
        if separator in text:
            head = text.split(separator, 2)[:2]
            break
    else:
        head = [text]

    name = None
    for candidate in head:
        found = coding_re.match(candidate)
        if found is not None:
            name = found.group(1)
            break
        if not blank_re.match(candidate):
            # Real code (or other text) precedes any declaration.
            return None
    if name is None:
        return None

    try:
        codecs.lookup(name)
    except LookupError:
        # The standard encoding error does not indicate the encoding
        raise LookupError("Unknown encoding: "+name)
    return name
 class IOBinding:
 # One instance per editor Window so methods know which to save, close.
@ -78,7 +33,7 @@ class IOBinding:
                                          self.save_as)
        self.__id_savecopy = self.text.bind("<<save-copy-of-window-as-file>>",
                                            self.save_a_copy)
-        self.fileencoding = None
+        self.fileencoding = 'utf-8'
        self.__id_print = self.text.bind("<<print-window>>", self.print_window)
    def close(self):
@ -165,34 +120,44 @@ class IOBinding:
            self.text.focus_set()
        return "break"
    eol = r"(\r\n)|\n|\r"  # \r\n (Windows), \n (UNIX), or \r (Mac)
    eol_re = re.compile(eol)
    eol_convention = os.linesep  # default
    def loadfile(self, filename):
        try:
-            # open the file in binary mode so that we can handle
+            try:
-            # end-of-line convention ourselves.
+                with tokenize.open(filename) as f:
-            with open(filename, 'rb') as f:
+                    chars = f.read()
-                two_lines = f.readline() + f.readline()
+                    fileencoding = f.encoding
-                f.seek(0)
+                    eol_convention = f.newlines
-                bytes = f.read()
+                    converted = False
-        except OSError as msg:
+            except (UnicodeDecodeError, SyntaxError):
-            tkMessageBox.showerror("I/O Error", str(msg), parent=self.text)
+                # Wait for the editor window to appear
                self.editwin.text.update()
                enc = askstring(
                    "Specify file encoding",
                    "The file's encoding is invalid for Python 3.x.\n"
                    "IDLE will convert it to UTF-8.\n"
                    "What is the current encoding of the file?",
                    initialvalue='utf-8',
                    parent=self.editwin.text)
                with open(filename, encoding=enc) as f:
                    chars = f.read()
                    fileencoding = f.encoding
                    eol_convention = f.newlines
                    converted = True
        except OSError as err:
            tkMessageBox.showerror("I/O Error", str(err), parent=self.text)
            return False
-        chars, converted = self._decode(two_lines, bytes)
+        except UnicodeDecodeError:
        if chars is None:
            tkMessageBox.showerror("Decoding Error",
                                   "File %s\nFailed to Decode" % filename,
                                   parent=self.text)
            return False
-        # We now convert all end-of-lines to '\n's
+
        firsteol = self.eol_re.search(chars)
        if firsteol:
            self.eol_convention = firsteol.group(0)
            chars = self.eol_re.sub(r"\n", chars)
        self.text.delete("1.0", "end")
        self.set_filename(None)
        self.fileencoding = fileencoding
        self.eol_convention = eol_convention
        self.text.insert("1.0", chars)
        self.reset_undo()
        self.set_filename(filename)
@ -205,74 +170,6 @@ class IOBinding:
        self.updaterecentfileslist(filename)
        return True
    def _decode(self, two_lines, bytes):
        """Create a Unicode string from the raw file contents.

        two_lines -- the first two lines of the file, handed to
                     coding_spec() to look for an encoding declaration.
        bytes     -- the complete file contents.

        Return a (chars, converted) pair.  chars is the decoded text, or
        None when decoding failed.  converted is True only when the last
        resort was reached, i.e. the user was asked for the encoding.
        self.fileencoding is updated on success: 'BOM' for a UTF-8
        signature, the declared name, 'utf-8', or None (plain ASCII or a
        user-supplied encoding).
        """
        # A UTF-8 signature settles the question one way or the other.
        if bytes.startswith(BOM_UTF8):
            try:
                text = bytes[3:].decode("utf-8")
            except UnicodeDecodeError:
                # has UTF-8 signature, but fails to decode...
                return None, False
            # Indicates that this file originally had a BOM
            self.fileencoding = 'BOM'
            return text, False

        # Next look for a coding specification.
        try:
            declared = coding_spec(two_lines)
        except LookupError as name:
            tkMessageBox.showerror(
                title="Error loading the file",
                message="The encoding '%s' is not known to this Python "
                        "installation. The file may not display correctly"
                        % name,
                parent=self.text)
            declared = None
        except UnicodeDecodeError:
            return None, False

        # Each entry is (codec to try, value to record in self.fileencoding).
        # Order matters: declared encoding first, then ASCII, then UTF-8.
        attempts = []
        if declared:
            attempts.append((declared, declared))
        attempts.append(('ascii', None))
        attempts.append(('utf-8', 'utf-8'))
        for codec, recorded in attempts:
            try:
                text = str(bytes, codec)
            except UnicodeDecodeError:
                continue
            self.fileencoding = recorded
            return text, False

        # Finally, ask the user, offering the locale's encoding as the
        # default.  This is deprecated; the user should declare a
        # non-ASCII encoding.
        text = None
        try:
            # Wait for the editor window to appear
            self.editwin.text.update()
            answer = askstring(
                "Specify file encoding",
                "The file's encoding is invalid for Python 3.x.\n"
                "IDLE will convert it to UTF-8.\n"
                "What is the current encoding of the file?",
                initialvalue=encoding,
                parent=self.editwin.text)
            if answer:
                text = str(bytes, answer)
                self.fileencoding = None
            # An empty or cancelled answer leaves text as None.
            return text, True
        except (UnicodeDecodeError, LookupError):
            return None, False  # None on failure
    def maybesave(self):
        if self.get_saved():
            return "yes"
@ -360,30 +257,22 @@ class IOBinding:
            # text to us. Don't try to guess further.
            return chars
        # Preserve a BOM that might have been present on opening
-        if self.fileencoding == 'BOM':
+        if self.fileencoding == 'utf-8-sig':
-            return BOM_UTF8 + chars.encode("utf-8")
+            return chars.encode('utf-8-sig')
        # See whether there is anything non-ASCII in it.
        # If not, no need to figure out the encoding.
        try:
            return chars.encode('ascii')
-        except UnicodeError:
+        except UnicodeEncodeError:
            pass
        # Check if there is an encoding declared
        try:
-            # a string, let coding_spec slice it to the first two lines
+            encoded = chars.encode('ascii', 'replace')
-            enc = coding_spec(chars)
+            enc, _ = tokenize.detect_encoding(io.BytesIO(encoded).readline)
            failed = None
        except LookupError as msg:
            failed = msg
            enc = None
        else:
            if not enc:
                # PEP 3120: default source encoding is UTF-8
                enc = 'utf-8'
        if enc:
            try:
            return chars.encode(enc)
-            except UnicodeError:
+        except SyntaxError as err:
            failed = str(err)
        except UnicodeEncodeError:
            failed = "Invalid encoding '%s'" % enc
        tkMessageBox.showerror(
            "I/O Error",
@ -391,7 +280,7 @@ class IOBinding:
            parent=self.text)
        # Fallback: save as UTF-8, with BOM - ignoring the incorrect
        # declared encoding
-        return BOM_UTF8 + chars.encode("utf-8")
+        return chars.encode('utf-8-sig')
    def print_window(self, event):
        confirm = tkMessageBox.askokcancel(