mirror of
https://github.com/python/cpython.git
synced 2025-08-29 13:15:11 +00:00
Instead of pickling the whole decoder, use the new getstate/setstate API.
This commit is contained in:
parent
3abcb013b8
commit
d76e7796c9
2 changed files with 57 additions and 33 deletions
60
Lib/io.py
60
Lib/io.py
|
@ -18,7 +18,7 @@ XXX don't use assert to validate input requirements
|
||||||
XXX whenever an argument is None, use the default value
|
XXX whenever an argument is None, use the default value
|
||||||
XXX read/write ops should check readable/writable
|
XXX read/write ops should check readable/writable
|
||||||
XXX buffered readinto should work with arbitrary buffer objects
|
XXX buffered readinto should work with arbitrary buffer objects
|
||||||
XXX use incremental encoder for text output, at least for UTF-16
|
XXX use incremental encoder for text output, at least for UTF-16 and UTF-8-SIG
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__author__ = ("Guido van Rossum <guido@python.org>, "
|
__author__ = ("Guido van Rossum <guido@python.org>, "
|
||||||
|
@ -36,11 +36,6 @@ import codecs
|
||||||
import _fileio
|
import _fileio
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
try:
|
|
||||||
import cPickle as pickle
|
|
||||||
except ImportError:
|
|
||||||
import pickle
|
|
||||||
|
|
||||||
# XXX Shouldn't we use st_blksize whenever we can?
|
# XXX Shouldn't we use st_blksize whenever we can?
|
||||||
DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes
|
DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes
|
||||||
|
|
||||||
|
@ -957,17 +952,16 @@ class TextIOWrapper(TextIOBase):
|
||||||
self._newline = newline or os.linesep
|
self._newline = newline or os.linesep
|
||||||
self._fix_newlines = newline is None
|
self._fix_newlines = newline is None
|
||||||
self._decoder = None
|
self._decoder = None
|
||||||
self._decoder_in_rest_pickle = None
|
|
||||||
self._pending = ""
|
self._pending = ""
|
||||||
self._snapshot = None
|
self._snapshot = None
|
||||||
self._seekable = self._telling = self.buffer.seekable()
|
self._seekable = self._telling = self.buffer.seekable()
|
||||||
|
|
||||||
# A word about _snapshot. This attribute is either None, or a
|
# A word about _snapshot. This attribute is either None, or a
|
||||||
# tuple (decoder_pickle, readahead, pending) where decoder_pickle
|
# tuple (decoder_state, readahead, pending) where decoder_state is
|
||||||
# is a pickled decoder state, readahead is the chunk of bytes that
|
# the second (integer) item of decoder.getstate(), readahead is the
|
||||||
# was read, and pending is the characters that were rendered by
|
# chunk of bytes that was read, and pending is the characters that
|
||||||
# the decoder after feeding it those bytes. We use this to
|
# were rendered by the decoder after feeding it those bytes. We
|
||||||
# reconstruct intermediate decoder states in tell().
|
# use this to reconstruct intermediate decoder states in tell().
|
||||||
|
|
||||||
def _seekable(self):
|
def _seekable(self):
|
||||||
return self._seekable
|
return self._seekable
|
||||||
|
@ -1005,10 +999,6 @@ class TextIOWrapper(TextIOBase):
|
||||||
raise IOError("Can't find an incremental decoder for encoding %s" %
|
raise IOError("Can't find an incremental decoder for encoding %s" %
|
||||||
self._encoding)
|
self._encoding)
|
||||||
decoder = self._decoder = make_decoder() # XXX: errors
|
decoder = self._decoder = make_decoder() # XXX: errors
|
||||||
if isinstance(decoder, codecs.BufferedIncrementalDecoder):
|
|
||||||
# XXX Hack: make the codec use bytes instead of strings
|
|
||||||
decoder.buffer = b""
|
|
||||||
self._decoder_in_rest_pickle = pickle.dumps(decoder, 2) # For tell()
|
|
||||||
return decoder
|
return decoder
|
||||||
|
|
||||||
def _read_chunk(self):
|
def _read_chunk(self):
|
||||||
|
@ -1017,15 +1007,13 @@ class TextIOWrapper(TextIOBase):
|
||||||
readahead = self.buffer.read1(self._CHUNK_SIZE)
|
readahead = self.buffer.read1(self._CHUNK_SIZE)
|
||||||
pending = self._decoder.decode(readahead, not readahead)
|
pending = self._decoder.decode(readahead, not readahead)
|
||||||
return readahead, pending
|
return readahead, pending
|
||||||
decoder_state = pickle.dumps(self._decoder, 2)
|
decoder_buffer, decoder_state = self._decoder.getstate()
|
||||||
readahead = self.buffer.read1(self._CHUNK_SIZE)
|
readahead = self.buffer.read1(self._CHUNK_SIZE)
|
||||||
pending = self._decoder.decode(readahead, not readahead)
|
pending = self._decoder.decode(readahead, not readahead)
|
||||||
self._snapshot = (decoder_state, readahead, pending)
|
self._snapshot = (decoder_state, decoder_buffer + readahead, pending)
|
||||||
return readahead, pending
|
return readahead, pending
|
||||||
|
|
||||||
def _encode_decoder_state(self, ds, pos):
|
def _encode_decoder_state(self, ds, pos):
|
||||||
if ds == self._decoder_in_rest_pickle:
|
|
||||||
return pos
|
|
||||||
x = 0
|
x = 0
|
||||||
for i in bytes(ds):
|
for i in bytes(ds):
|
||||||
x = x<<8 | i
|
x = x<<8 | i
|
||||||
|
@ -1048,7 +1036,8 @@ class TextIOWrapper(TextIOBase):
|
||||||
raise IOError("Telling position disabled by next() call")
|
raise IOError("Telling position disabled by next() call")
|
||||||
self.flush()
|
self.flush()
|
||||||
position = self.buffer.tell()
|
position = self.buffer.tell()
|
||||||
if self._decoder is None or self._snapshot is None:
|
decoder = self._decoder
|
||||||
|
if decoder is None or self._snapshot is None:
|
||||||
assert self._pending == ""
|
assert self._pending == ""
|
||||||
return position
|
return position
|
||||||
decoder_state, readahead, pending = self._snapshot
|
decoder_state, readahead, pending = self._snapshot
|
||||||
|
@ -1056,15 +1045,21 @@ class TextIOWrapper(TextIOBase):
|
||||||
needed = len(pending) - len(self._pending)
|
needed = len(pending) - len(self._pending)
|
||||||
if not needed:
|
if not needed:
|
||||||
return self._encode_decoder_state(decoder_state, position)
|
return self._encode_decoder_state(decoder_state, position)
|
||||||
decoder = pickle.loads(decoder_state)
|
saved_state = decoder.getstate()
|
||||||
n = 0
|
try:
|
||||||
bb = bytes(1)
|
decoder.setstate(("", decoder_state))
|
||||||
for i, bb[0] in enumerate(readahead):
|
n = 0
|
||||||
n += len(decoder.decode(bb))
|
bb = bytes(1)
|
||||||
if n >= needed:
|
for i, bb[0] in enumerate(readahead):
|
||||||
decoder_state = pickle.dumps(decoder, 2)
|
n += len(decoder.decode(bb))
|
||||||
return self._encode_decoder_state(decoder_state, position+i+1)
|
if n >= needed:
|
||||||
raise IOError("Can't reconstruct logical file position")
|
decoder_buffer, decoder_state = decoder.getstate()
|
||||||
|
return self._encode_decoder_state(
|
||||||
|
decoder_state,
|
||||||
|
position + (i+1) - len(decoder_buffer))
|
||||||
|
raise IOError("Can't reconstruct logical file position")
|
||||||
|
finally:
|
||||||
|
decoder.setstate(saved_state)
|
||||||
|
|
||||||
def seek(self, pos, whence=0):
|
def seek(self, pos, whence=0):
|
||||||
if not self._seekable:
|
if not self._seekable:
|
||||||
|
@ -1097,12 +1092,11 @@ class TextIOWrapper(TextIOBase):
|
||||||
self._pending = ""
|
self._pending = ""
|
||||||
self._decoder = None
|
self._decoder = None
|
||||||
return pos
|
return pos
|
||||||
decoder = pickle.loads(ds)
|
decoder = self._decoder or self._get_decoder()
|
||||||
|
decoder.set_state(("", ds))
|
||||||
self.buffer.seek(pos)
|
self.buffer.seek(pos)
|
||||||
self._snapshot = (ds, b"", "")
|
self._snapshot = (ds, b"", "")
|
||||||
self._pending = ""
|
self._pending = ""
|
||||||
if not self._decoder_in_rest_pickle:
|
|
||||||
self._get_decoder() # For its side effect
|
|
||||||
self._decoder = decoder
|
self._decoder = decoder
|
||||||
return orig_pos
|
return orig_pos
|
||||||
|
|
||||||
|
|
|
@ -581,6 +581,36 @@ class TextIOWrapperTest(unittest.TestCase):
|
||||||
self.assertEquals(f.tell(), p2)
|
self.assertEquals(f.tell(), p2)
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
|
def testSeeking(self):
|
||||||
|
chunk_size = io.TextIOWrapper._CHUNK_SIZE
|
||||||
|
prefix_size = chunk_size - 2
|
||||||
|
u_prefix = u"a" * prefix_size
|
||||||
|
prefix = bytes(u_prefix.encode("utf-8"))
|
||||||
|
self.assertEquals(len(u_prefix), len(prefix))
|
||||||
|
u_suffix = u"\u8888\n"
|
||||||
|
suffix = bytes(u_suffix.encode("utf-8"))
|
||||||
|
line = prefix + suffix
|
||||||
|
f = io.open(test_support.TESTFN, "wb")
|
||||||
|
f.write(line*2)
|
||||||
|
f.close()
|
||||||
|
f = io.open(test_support.TESTFN, "r", encoding="utf-8")
|
||||||
|
s = f.read(prefix_size)
|
||||||
|
self.assertEquals(s, prefix)
|
||||||
|
self.assertEquals(f.tell(), prefix_size)
|
||||||
|
self.assertEquals(f.readline(), u_suffix)
|
||||||
|
|
||||||
|
def testSeekingToo(self):
|
||||||
|
# Regression test for a specific bug
|
||||||
|
data = b'\xe0\xbf\xbf\n'
|
||||||
|
f = io.open(test_support.TESTFN, "wb")
|
||||||
|
f.write(data)
|
||||||
|
f.close()
|
||||||
|
f = io.open(test_support.TESTFN, "r", encoding="utf-8")
|
||||||
|
f._CHUNK_SIZE # Just test that it exists
|
||||||
|
f._CHUNK_SIZE = 2
|
||||||
|
f.readline()
|
||||||
|
f.tell()
|
||||||
|
|
||||||
def timingTest(self):
|
def timingTest(self):
|
||||||
timer = time.time
|
timer = time.time
|
||||||
enc = "utf8"
|
enc = "utf8"
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue