Issue #5006: Better handling of Unicode byte-order marks (BOM) in the io library.

This means, for example, that opening a UTF-16 text file in append mode doesn't add a BOM at the end of the file if the file isn't empty.
2025-09-26 10:19:53 +00:00 · 2009-05-14 18:55:55 +00:00 · 2009-05-14 18:55:55 +00:00 · e450185b4a
commit e450185b4a
parent b565577aa7
6 changed files with 168 additions and 22 deletions
--- a/Lib/_pyio.py
+++ b/Lib/_pyio.py
@ -1436,6 +1436,15 @@ class TextIOWrapper(TextIOBase):
        self._snapshot = None  # info for reconstructing decoder state
        self._seekable = self._telling = self.buffer.seekable()

+        if self._seekable and self.writable():
+            position = self.buffer.tell()
+            if position != 0:
+                try:
+                    self._get_encoder().setstate(0)
+                except LookupError:
+                    # Sometimes the encoder doesn't exist
+                    pass
+
    # self._snapshot is either None, or a tuple (dec_flags, next_input)
    # where dec_flags is the second (integer) item of the decoder state
    # and next_input is the chunk of input bytes that comes next after the
@ -1741,6 +1750,17 @@ class TextIOWrapper(TextIOBase):
                raise IOError("can't restore logical file position")
            self._decoded_chars_used = chars_to_skip

+        # Finally, reset the encoder (merely useful for proper BOM handling)
+        try:
+            encoder = self._encoder or self._get_encoder()
+        except LookupError:
+            # Sometimes the encoder doesn't exist
+            pass
+        else:
+            if cookie != 0:
+                encoder.setstate(0)
+            else:
+                encoder.reset()
        return cookie

    def read(self, n=None):