Merged revisions 67762 via svnmerge from

svn+ssh://pythondev@svn.python.org/python/trunk

........
  r67762 | antoine.pitrou | 2008-12-14 18:40:51 +0100 (dim., 14 déc. 2008) | 3 lines

  Backport r67759 (fix io.IncrementalNewlineDecoder for UTF-16 et al.).
........
This commit is contained in:
Antoine Pitrou 2008-12-14 18:08:37 +00:00
parent 30327242b3
commit f8638a8d21
3 changed files with 89 additions and 56 deletions

View file

@ -1292,25 +1292,23 @@ class IncrementalNewlineDecoder(codecs.IncrementalDecoder):
""" """
def __init__(self, decoder, translate, errors='strict'): def __init__(self, decoder, translate, errors='strict'):
codecs.IncrementalDecoder.__init__(self, errors=errors) codecs.IncrementalDecoder.__init__(self, errors=errors)
self.buffer = b''
self.translate = translate self.translate = translate
self.decoder = decoder self.decoder = decoder
self.seennl = 0 self.seennl = 0
self.pendingcr = False
def decode(self, input, final=False): def decode(self, input, final=False):
# decode input (with the eventual \r from a previous pass) # decode input (with the eventual \r from a previous pass)
if self.buffer:
input = self.buffer + input
output = self.decoder.decode(input, final=final) output = self.decoder.decode(input, final=final)
if self.pendingcr and (output or final):
output = "\r" + output
self.pendingcr = False
# retain last \r even when not translating data: # retain last \r even when not translating data:
# then readline() is sure to get \r\n in one pass # then readline() is sure to get \r\n in one pass
if output.endswith("\r") and not final: if output.endswith("\r") and not final:
output = output[:-1] output = output[:-1]
self.buffer = b'\r' self.pendingcr = True
else:
self.buffer = b''
# Record which newlines are read # Record which newlines are read
crlf = output.count('\r\n') crlf = output.count('\r\n')
@ -1329,20 +1327,19 @@ class IncrementalNewlineDecoder(codecs.IncrementalDecoder):
def getstate(self): def getstate(self):
buf, flag = self.decoder.getstate() buf, flag = self.decoder.getstate()
return buf + self.buffer, flag flag <<= 1
if self.pendingcr:
flag |= 1
return buf, flag
def setstate(self, state): def setstate(self, state):
buf, flag = state buf, flag = state
if buf.endswith(b'\r'): self.pendingcr = bool(flag & 1)
self.buffer = b'\r' self.decoder.setstate((buf, flag >> 1))
buf = buf[:-1]
else:
self.buffer = b''
self.decoder.setstate((buf, flag))
def reset(self): def reset(self):
self.seennl = 0 self.seennl = 0
self.buffer = b'' self.pendingcr = False
self.decoder.reset() self.decoder.reset()
_LF = 1 _LF = 1

View file

@ -680,8 +680,9 @@ class StatefulIncrementalDecoder(codecs.IncrementalDecoder):
@classmethod @classmethod
def lookupTestDecoder(cls, name): def lookupTestDecoder(cls, name):
if cls.codecEnabled and name == 'test_decoder': if cls.codecEnabled and name == 'test_decoder':
latin1 = codecs.lookup('latin-1')
return codecs.CodecInfo( return codecs.CodecInfo(
name='test_decoder', encode=None, decode=None, name='test_decoder', encode=latin1.encode, decode=None,
incrementalencoder=None, incrementalencoder=None,
streamreader=None, streamwriter=None, streamreader=None, streamwriter=None,
incrementaldecoder=cls) incrementaldecoder=cls)
@ -840,8 +841,11 @@ class TextIOWrapperTest(unittest.TestCase):
[ '\r\n', [ "unix\nwindows\r\n", "os9\rlast\nnonl" ] ], [ '\r\n', [ "unix\nwindows\r\n", "os9\rlast\nnonl" ] ],
[ '\r', [ "unix\nwindows\r", "\nos9\r", "last\nnonl" ] ], [ '\r', [ "unix\nwindows\r", "\nos9\r", "last\nnonl" ] ],
] ]
encodings = (
encodings = ('utf-8', 'latin-1') 'utf-8', 'latin-1',
'utf-16', 'utf-16-le', 'utf-16-be',
'utf-32', 'utf-32-le', 'utf-32-be',
)
# Try a range of buffer sizes to test the case where \r is the last # Try a range of buffer sizes to test the case where \r is the last
# character in TextIOWrapper._pending_line. # character in TextIOWrapper._pending_line.
@ -1195,55 +1199,83 @@ class TextIOWrapperTest(unittest.TestCase):
self.assertEqual(buffer.seekable(), txt.seekable()) self.assertEqual(buffer.seekable(), txt.seekable())
def test_newline_decoder(self): def check_newline_decoder_utf8(self, decoder):
import codecs # UTF-8 specific tests for a newline decoder
decoder = codecs.getincrementaldecoder("utf-8")() def _check_decode(b, s, **kwargs):
decoder = io.IncrementalNewlineDecoder(decoder, translate=True) # We exercise getstate() / setstate() as well as decode()
state = decoder.getstate()
self.assertEquals(decoder.decode(b, **kwargs), s)
decoder.setstate(state)
self.assertEquals(decoder.decode(b, **kwargs), s)
self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), u"\u8888") _check_decode(b'\xe8\xa2\x88', "\u8888")
self.assertEquals(decoder.decode(b'\xe8'), u"") _check_decode(b'\xe8', "")
self.assertEquals(decoder.decode(b'\xa2'), u"") _check_decode(b'\xa2', "")
self.assertEquals(decoder.decode(b'\x88'), u"\u8888") _check_decode(b'\x88', "\u8888")
self.assertEquals(decoder.decode(b'\xe8'), u"") _check_decode(b'\xe8', "")
_check_decode(b'\xa2', "")
_check_decode(b'\x88', "\u8888")
_check_decode(b'\xe8', "")
self.assertRaises(UnicodeDecodeError, decoder.decode, b'', final=True) self.assertRaises(UnicodeDecodeError, decoder.decode, b'', final=True)
decoder.setstate((b'', 0)) decoder.reset()
self.assertEquals(decoder.decode(b'\n'), u"\n") _check_decode(b'\n', "\n")
self.assertEquals(decoder.decode(b'\r'), u"") _check_decode(b'\r', "")
self.assertEquals(decoder.decode(b'', final=True), u"\n") _check_decode(b'', "\n", final=True)
self.assertEquals(decoder.decode(b'\r', final=True), u"\n") _check_decode(b'\r', "\n", final=True)
self.assertEquals(decoder.decode(b'\r'), u"") _check_decode(b'\r', "")
self.assertEquals(decoder.decode(b'a'), u"\na") _check_decode(b'a', "\na")
self.assertEquals(decoder.decode(b'\r\r\n'), u"\n\n") _check_decode(b'\r\r\n', "\n\n")
self.assertEquals(decoder.decode(b'\r'), u"") _check_decode(b'\r', "")
self.assertEquals(decoder.decode(b'\r'), u"\n") _check_decode(b'\r', "\n")
self.assertEquals(decoder.decode(b'\na'), u"\na") _check_decode(b'\na', "\na")
self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r\n'), u"\u8888\n") _check_decode(b'\xe8\xa2\x88\r\n', "\u8888\n")
self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), u"\u8888") _check_decode(b'\xe8\xa2\x88', "\u8888")
self.assertEquals(decoder.decode(b'\n'), u"\n") _check_decode(b'\n', "\n")
self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r'), u"\u8888") _check_decode(b'\xe8\xa2\x88\r', "\u8888")
self.assertEquals(decoder.decode(b'\n'), u"\n") _check_decode(b'\n', "\n")
def check_newline_decoder(self, decoder, encoding):
result = []
encoder = codecs.getincrementalencoder(encoding)()
def _decode_bytewise(s):
for b in encoder.encode(s):
result.append(decoder.decode(b))
self.assertEquals(decoder.newlines, None)
_decode_bytewise("abc\n\r")
self.assertEquals(decoder.newlines, '\n')
_decode_bytewise("\nabc")
self.assertEquals(decoder.newlines, ('\n', '\r\n'))
_decode_bytewise("abc\r")
self.assertEquals(decoder.newlines, ('\n', '\r\n'))
_decode_bytewise("abc")
self.assertEquals(decoder.newlines, ('\r', '\n', '\r\n'))
_decode_bytewise("abc\r")
self.assertEquals("".join(result), "abc\n\nabcabc\nabcabc")
decoder.reset()
self.assertEquals(decoder.decode("abc".encode(encoding)), "abc")
self.assertEquals(decoder.newlines, None)
def test_newline_decoder(self):
encodings = (
'utf-8', 'latin-1',
'utf-16', 'utf-16-le', 'utf-16-be',
'utf-32', 'utf-32-le', 'utf-32-be',
)
for enc in encodings:
decoder = codecs.getincrementaldecoder(enc)()
decoder = io.IncrementalNewlineDecoder(decoder, translate=True)
self.check_newline_decoder(decoder, enc)
decoder = codecs.getincrementaldecoder("utf-8")() decoder = codecs.getincrementaldecoder("utf-8")()
decoder = io.IncrementalNewlineDecoder(decoder, translate=True) decoder = io.IncrementalNewlineDecoder(decoder, translate=True)
self.assertEquals(decoder.newlines, None) self.check_newline_decoder_utf8(decoder)
decoder.decode(b"abc\n\r")
self.assertEquals(decoder.newlines, u'\n')
decoder.decode(b"\nabc")
self.assertEquals(decoder.newlines, ('\n', '\r\n'))
decoder.decode(b"abc\r")
self.assertEquals(decoder.newlines, ('\n', '\r\n'))
decoder.decode(b"abc")
self.assertEquals(decoder.newlines, ('\r', '\n', '\r\n'))
decoder.decode(b"abc\r")
decoder.reset()
self.assertEquals(decoder.decode(b"abc"), "abc")
self.assertEquals(decoder.newlines, None)
# XXX Tests for open() # XXX Tests for open()

View file

@ -32,6 +32,10 @@ Core and Builtins
Library Library
------- -------
- Issue #4574: fix a crash in io.IncrementalNewlineDecoder when a carriage
return encodes to more than one byte in the source encoding (e.g. UTF-16)
and gets split on a chunk boundary.
- Issue #4223: inspect.getsource() will now correctly display source code - Issue #4223: inspect.getsource() will now correctly display source code
for packages loaded via zipimport (or any other conformant PEP 302 for packages loaded via zipimport (or any other conformant PEP 302
loader). Original patch by Alexander Belopolsky. loader). Original patch by Alexander Belopolsky.