Return complete lines from codec stream readers

even if there is an exception in later lines, resulting in correct line numbers for decoding errors in source code. Fixes #1178484. Will backport to 2.4.
2025-09-26 10:19:53 +00:00 · 2005-08-24 07:38:12 +00:00 · 2005-08-24 07:38:12 +00:00 · 56066d2e55
commit 56066d2e55
parent 6d2b346140
3 changed files with 26 additions and 4 deletions
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@ -236,7 +236,7 @@ class StreamReader(Codec):
    def decode(self, input, errors='strict'):
        raise NotImplementedError

-    def read(self, size=-1, chars=-1):
+    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.
@ -253,6 +253,11 @@ class StreamReader(Codec):
            is intended to prevent having to decode huge files in one
            step.

+            If firstline is true, and a UnicodeDecodeError happens
+            after the first line terminator in the input only the first line
+            will be returned, the rest of the input will be kept until the
+            next call to read().
+
            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
@ -275,7 +280,16 @@ class StreamReader(Codec):
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
-            newchars, decodedbytes = self.decode(data, self.errors)
+            try:
+                newchars, decodedbytes = self.decode(data, self.errors)
+            except UnicodeDecodeError, exc:
+                if firstline:
+                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
+                    lines = newchars.splitlines(True)
+                    if len(lines)<=1:
+                        raise
+                else:
+                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
@ -306,7 +320,7 @@ class StreamReader(Codec):
        line = ""
        # If size is given, we call read() only once
        while True:
-            data = self.read(readsize)
+            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is