Return complete lines from codec stream readers

even if there is an exception in later lines, resulting in
correct line numbers for decoding errors in source code. Fixes #1178484.
Will backport to 2.4.
This commit is contained in:
Martin v. Löwis 2005-08-24 07:38:12 +00:00
parent 6d2b346140
commit 56066d2e55
3 changed files with 26 additions and 4 deletions

View file

@ -236,7 +236,7 @@ class StreamReader(Codec):
def decode(self, input, errors='strict'):
raise NotImplementedError
def read(self, size=-1, chars=-1):
def read(self, size=-1, chars=-1, firstline=False):
""" Decodes data from the stream self.stream and returns the
resulting object.
@ -253,6 +253,11 @@ class StreamReader(Codec):
is intended to prevent having to decode huge files in one
step.
If firstline is true, and a UnicodeDecodeError happens
after the first line terminator in the input, only the first line
will be returned; the rest of the input will be kept until the
next call to read().
The method should use a greedy read strategy meaning that
it should read as much data as is allowed within the
definition of the encoding and the given size, e.g. if
@ -275,7 +280,16 @@ class StreamReader(Codec):
newdata = self.stream.read(size)
# decode bytes (those remaining from the last call included)
data = self.bytebuffer + newdata
newchars, decodedbytes = self.decode(data, self.errors)
try:
newchars, decodedbytes = self.decode(data, self.errors)
except UnicodeDecodeError, exc:
if firstline:
newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
lines = newchars.splitlines(True)
if len(lines)<=1:
raise
else:
raise
# keep undecoded bytes until the next call
self.bytebuffer = data[decodedbytes:]
# put new characters in the character buffer
@ -306,7 +320,7 @@ class StreamReader(Codec):
line = ""
# If size is given, we call read() only once
while True:
data = self.read(readsize)
data = self.read(readsize, firstline=True)
if data:
# If we're at a "\r" read one extra character (which might
# be a "\n") to get a proper line ending. If the stream is