mirror of
https://github.com/python/cpython.git
synced 2025-10-09 16:34:44 +00:00
Clean up the TextIOWrapper code; pick better names; improve documentation.
This commit is contained in:
parent
dbe28e573e
commit
593cd6b843
1 changed files with 122 additions and 124 deletions
246
Lib/io.py
246
Lib/io.py
|
@ -1179,20 +1179,19 @@ class TextIOWrapper(TextIOBase):
|
|||
self._writenl = newline or os.linesep
|
||||
self._encoder = None
|
||||
self._decoder = None
|
||||
self._decoded_text = "" # buffer for text produced by decoder
|
||||
self._decoded_text_offset = 0 # offset to text returned by read()
|
||||
self._decoded_chars = '' # buffer for text returned from decoder
|
||||
self._decoded_chars_used = 0 # offset into _decoded_chars for read()
|
||||
self._snapshot = None # info for reconstructing decoder state
|
||||
self._seekable = self._telling = self.buffer.seekable()
|
||||
|
||||
# A word about _snapshot. This attribute is either None, or a tuple
|
||||
# (decoder_state, next_input) where decoder_state is the second
|
||||
# (integer) item of the decoder state, and next_input is the chunk
|
||||
# of bytes that comes after the snapshot point in the input.
|
||||
# We use this to reconstruct intermediate decoder states in tell().
|
||||
# self._snapshot is either None, or a tuple (dec_flags, next_input)
|
||||
# where dec_flags is the second (integer) item of the decoder state
|
||||
# and next_input is the chunk of input bytes that comes next after the
|
||||
# snapshot point. We use this to reconstruct decoder states in tell().
|
||||
|
||||
# Naming convention:
|
||||
# - integer variables ending in "_bytes" count input bytes
|
||||
# - integer variables ending in "_chars" count decoded characters
|
||||
# - "bytes_..." for integer variables that count input bytes
|
||||
# - "chars_..." for integer variables that count decoded characters
|
||||
|
||||
def __repr__(self):
|
||||
return '<TIOW %x>' % id(self)
|
||||
|
@ -1267,62 +1266,79 @@ class TextIOWrapper(TextIOBase):
|
|||
self._decoder = decoder
|
||||
return decoder
|
||||
|
||||
# The following three methods implement an ADT for _decoded_chars.
|
||||
# Text returned from the decoder is buffered here until the client
|
||||
# requests it by calling our read() or readline() method.
|
||||
def _set_decoded_chars(self, chars):
|
||||
"""Set the _decoded_chars buffer."""
|
||||
self._decoded_chars = chars
|
||||
self._decoded_chars_used = 0
|
||||
|
||||
def _get_decoded_chars(self, n=None):
|
||||
"""Advance into the _decoded_chars buffer."""
|
||||
offset = self._decoded_chars_used
|
||||
if n is None:
|
||||
chars = self._decoded_chars[offset:]
|
||||
else:
|
||||
chars = self._decoded_chars[offset:offset + n]
|
||||
self._decoded_chars_used += len(chars)
|
||||
return chars
|
||||
|
||||
def _rewind_decoded_chars(self, n):
|
||||
"""Rewind the _decoded_chars buffer."""
|
||||
if self._decoded_chars_used < n:
|
||||
raise AssertionError("rewind decoded_chars out of bounds")
|
||||
self._decoded_chars_used -= n
|
||||
|
||||
def _read_chunk(self):
|
||||
"""
|
||||
Read and decode the next chunk of data from the BufferedReader.
|
||||
|
||||
The return value is True unless EOF was reached. The decoded string
|
||||
is placed in self._decoded_text (replacing its previous value).
|
||||
(The entire input chunk is sent to the decoder, though some of it
|
||||
may remain buffered in the decoder, yet to be converted.)
|
||||
is placed in self._decoded_chars (replacing its previous value).
|
||||
The entire input chunk is sent to the decoder, though some of it
|
||||
may remain buffered in the decoder, yet to be converted.
|
||||
"""
|
||||
|
||||
if self._decoder is None:
|
||||
raise ValueError("no decoder")
|
||||
if not self._telling:
|
||||
# No one should call tell(), so don't bother taking a snapshot.
|
||||
input_chunk = self.buffer.read1(self._CHUNK_SIZE)
|
||||
eof = not input_chunk
|
||||
self._decoded_text = self._decoder.decode(input_chunk, eof)
|
||||
self._decoded_text_offset = 0
|
||||
return not eof
|
||||
|
||||
# The cookie returned by tell() cannot include the contents of
|
||||
# the decoder's buffer, so we need to snapshot a point in the
|
||||
# input where the decoder has nothing in its input buffer.
|
||||
if self._telling:
|
||||
# To prepare for tell(), we need to snapshot a point in the
|
||||
# file where the decoder's input buffer is empty.
|
||||
|
||||
dec_buffer, dec_flags = self._decoder.getstate()
|
||||
# The state tuple returned by getstate() contains the decoder's
|
||||
# input buffer and an integer representing any other state. Thus,
|
||||
# there is a valid snapshot point len(decoder_buffer) bytes ago in
|
||||
# the input, with the state tuple (b'', decoder_state).
|
||||
dec_buffer, dec_flags = self._decoder.getstate()
|
||||
# Given this, we know there was a valid snapshot point
|
||||
# len(dec_buffer) bytes ago with decoder state (b'', dec_flags).
|
||||
|
||||
# Read a chunk, decode it, and put the result in self._decoded_chars.
|
||||
input_chunk = self.buffer.read1(self._CHUNK_SIZE)
|
||||
eof = not input_chunk
|
||||
self._decoded_text = self._decoder.decode(input_chunk, eof)
|
||||
self._decoded_text_offset = 0
|
||||
self._set_decoded_chars(self._decoder.decode(input_chunk, eof))
|
||||
|
||||
if self._telling:
|
||||
# At the snapshot point, len(dec_buffer) bytes before the read,
|
||||
# the next input to be decoded is dec_buffer + input_chunk.
|
||||
self._snapshot = (dec_flags, dec_buffer + input_chunk)
|
||||
|
||||
# At the snapshot point, len(dec_buffer) bytes ago, the next input
|
||||
# to be passed to the decoder is dec_buffer + input_chunk.
|
||||
self._snapshot = (dec_flags, dec_buffer + input_chunk)
|
||||
return not eof
|
||||
|
||||
def _pack_cookie(self, position, dec_flags=0,
|
||||
feed_bytes=0, need_eof=0, skip_chars=0):
|
||||
bytes_to_feed=0, need_eof=0, chars_to_skip=0):
|
||||
# The meaning of a tell() cookie is: seek to position, set the
|
||||
# decoder flags to dec_flags, read feed_bytes bytes, feed them
|
||||
# decoder flags to dec_flags, read bytes_to_feed bytes, feed them
|
||||
# into the decoder with need_eof as the EOF flag, then skip
|
||||
# skip_chars characters of the decoded result. For most simple
|
||||
# decoders, this should often just be the position.
|
||||
return (position | (dec_flags<<64) | (feed_bytes<<128) |
|
||||
(skip_chars<<192) | bool(need_eof)<<256)
|
||||
# chars_to_skip characters of the decoded result. For most simple
|
||||
# decoders, tell() will often just give a byte offset in the file.
|
||||
return (position | (dec_flags<<64) | (bytes_to_feed<<128) |
|
||||
(chars_to_skip<<192) | bool(need_eof)<<256)
|
||||
|
||||
def _unpack_cookie(self, bigint):
|
||||
rest, position = divmod(bigint, 1<<64)
|
||||
rest, dec_flags = divmod(rest, 1<<64)
|
||||
rest, feed_bytes = divmod(rest, 1<<64)
|
||||
need_eof, skip_chars = divmod(rest, 1<<64)
|
||||
return position, dec_flags, feed_bytes, need_eof, skip_chars
|
||||
rest, bytes_to_feed = divmod(rest, 1<<64)
|
||||
need_eof, chars_to_skip = divmod(rest, 1<<64)
|
||||
return position, dec_flags, bytes_to_feed, need_eof, chars_to_skip
|
||||
|
||||
def tell(self):
|
||||
if not self._seekable:
|
||||
|
@ -1333,7 +1349,7 @@ class TextIOWrapper(TextIOBase):
|
|||
position = self.buffer.tell()
|
||||
decoder = self._decoder
|
||||
if decoder is None or self._snapshot is None:
|
||||
if self._decoded_text:
|
||||
if self._decoded_chars:
|
||||
# This should never happen.
|
||||
raise AssertionError("pending decoded text")
|
||||
return position
|
||||
|
@ -1342,51 +1358,48 @@ class TextIOWrapper(TextIOBase):
|
|||
dec_flags, next_input = self._snapshot
|
||||
position -= len(next_input)
|
||||
|
||||
# How many decoded characters have been returned since the snapshot?
|
||||
skip_chars = self._decoded_text_offset
|
||||
if skip_chars == 0:
|
||||
# How many decoded characters have been used up since the snapshot?
|
||||
chars_to_skip = self._decoded_chars_used
|
||||
if chars_to_skip == 0:
|
||||
# We haven't moved from the snapshot point.
|
||||
return self._pack_cookie(position, dec_flags)
|
||||
|
||||
# Walk the decoder forward, one byte at a time, to find the minimum
|
||||
# input necessary to give us the decoded characters we need to skip.
|
||||
# As we go, look for the "safe point" nearest to the current location
|
||||
# (i.e. a point where the decoder has nothing buffered, so we can
|
||||
# safely start from there when trying to return to this location).
|
||||
# Starting from the snapshot position, we will walk the decoder
|
||||
# forward until it gives us enough decoded characters.
|
||||
saved_state = decoder.getstate()
|
||||
try:
|
||||
decoder.setstate((b"", dec_flags))
|
||||
fed_bytes = 0
|
||||
decoded_chars = 0
|
||||
# Note our initial start point.
|
||||
decoder.setstate((b'', dec_flags))
|
||||
start_pos = position
|
||||
start_flags, bytes_fed, chars_decoded = dec_flags, 0, 0
|
||||
need_eof = 0
|
||||
last_safe_point = (dec_flags, 0, 0)
|
||||
|
||||
# Feed the decoder one byte at a time. As we go, note the
|
||||
# nearest "safe start point" before the current location
|
||||
# (a point where the decoder has nothing buffered, so seek()
|
||||
# can safely start from there and advance to this location).
|
||||
next_byte = bytearray(1)
|
||||
for next_byte[0] in next_input:
|
||||
decoded = decoder.decode(next_byte)
|
||||
fed_bytes += 1
|
||||
decoded_chars += len(decoded)
|
||||
bytes_fed += 1
|
||||
chars_decoded += len(decoder.decode(next_byte))
|
||||
dec_buffer, dec_flags = decoder.getstate()
|
||||
if not dec_buffer and decoded_chars <= skip_chars:
|
||||
# Decoder buffer is empty, so it's safe to start from here.
|
||||
last_safe_point = (dec_flags, fed_bytes, decoded_chars)
|
||||
if decoded_chars >= skip_chars:
|
||||
if not dec_buffer and chars_decoded <= chars_to_skip:
|
||||
# Decoder buffer is empty, so this is a safe start point.
|
||||
start_pos += bytes_fed
|
||||
chars_to_skip -= chars_decoded
|
||||
start_flags, bytes_fed, chars_decoded = dec_flags, 0, 0
|
||||
if chars_decoded >= chars_to_skip:
|
||||
break
|
||||
else:
|
||||
# We didn't get enough decoded data; signal EOF to get more.
|
||||
decoded = decoder.decode(b"", final=True)
|
||||
decoded_chars += len(decoded)
|
||||
chars_decoded += len(decoder.decode(b'', final=True))
|
||||
need_eof = 1
|
||||
if decoded_chars < skip_chars:
|
||||
if chars_decoded < chars_to_skip:
|
||||
raise IOError("can't reconstruct logical file position")
|
||||
|
||||
# Advance the starting position to the last safe point.
|
||||
dec_flags, safe_fed_bytes, safe_decoded_chars = last_safe_point
|
||||
position += safe_fed_bytes
|
||||
fed_bytes -= safe_fed_bytes
|
||||
skip_chars -= safe_decoded_chars
|
||||
# The returned cookie corresponds to the last safe start point.
|
||||
return self._pack_cookie(
|
||||
position, dec_flags, fed_bytes, need_eof, skip_chars)
|
||||
start_pos, start_flags, bytes_fed, need_eof, chars_to_skip)
|
||||
finally:
|
||||
decoder.setstate(saved_state)
|
||||
|
||||
|
@ -1405,7 +1418,8 @@ class TextIOWrapper(TextIOBase):
|
|||
raise IOError("can't do nonzero end-relative seeks")
|
||||
self.flush()
|
||||
position = self.buffer.seek(0, 2)
|
||||
self._clear_decoded_text()
|
||||
self._set_decoded_chars('')
|
||||
self._snapshot = None
|
||||
if self._decoder:
|
||||
self._decoder.reset()
|
||||
return position
|
||||
|
@ -1416,53 +1430,35 @@ class TextIOWrapper(TextIOBase):
|
|||
raise ValueError("negative seek position %r" % (cookie,))
|
||||
self.flush()
|
||||
|
||||
# Seek back to the snapshot point.
|
||||
position, dec_flags, feed_bytes, need_eof, skip_chars = \
|
||||
# The strategy of seek() is to go back to the safe start point
|
||||
# and replay the effect of read(chars_to_skip) from there.
|
||||
start_pos, dec_flags, bytes_to_feed, need_eof, chars_to_skip = \
|
||||
self._unpack_cookie(cookie)
|
||||
self.buffer.seek(position)
|
||||
self._clear_decoded_text()
|
||||
|
||||
if self._decoder or dec_flags or feed_bytes or need_eof:
|
||||
# Restore the decoder flags to their values from the snapshot.
|
||||
self._decoder = self._decoder or self._get_decoder()
|
||||
self._decoder.setstate((b"", dec_flags))
|
||||
self._snapshot = (dec_flags, b'')
|
||||
|
||||
if feed_bytes or need_eof:
|
||||
# Feed feed_bytes bytes to the decoder.
|
||||
input_chunk = self.buffer.read(feed_bytes)
|
||||
self._decoded_text = self._decoder.decode(input_chunk, need_eof)
|
||||
if len(self._decoded_text) < skip_chars:
|
||||
raise IOError("can't restore logical file position")
|
||||
|
||||
# Skip skip_chars of the decoded characters.
|
||||
self._decoded_text_offset = skip_chars
|
||||
|
||||
# Restore the snapshot.
|
||||
self._snapshot = (dec_flags, input_chunk)
|
||||
return cookie
|
||||
|
||||
def _clear_decoded_text(self):
|
||||
"""Reset the _decoded_text buffer."""
|
||||
self._decoded_text = ''
|
||||
self._decoded_text_offset = 0
|
||||
# Seek back to the safe start point.
|
||||
self.buffer.seek(start_pos)
|
||||
self._set_decoded_chars('')
|
||||
self._snapshot = None
|
||||
|
||||
def _emit_decoded_text(self, n=None):
|
||||
"""Advance into the _decoded_text buffer."""
|
||||
offset = self._decoded_text_offset
|
||||
if n is None:
|
||||
text = self._decoded_text[offset:]
|
||||
else:
|
||||
text = self._decoded_text[offset:offset + n]
|
||||
self._decoded_text_offset += len(text)
|
||||
return text
|
||||
# Restore the decoder to its state from the safe start point.
|
||||
if self._decoder or dec_flags or chars_to_skip:
|
||||
self._decoder = self._decoder or self._get_decoder()
|
||||
self._decoder.setstate((b'', dec_flags))
|
||||
self._snapshot = (dec_flags, b'')
|
||||
|
||||
def _unemit_decoded_text(self, n):
|
||||
"""Rewind the _decoded_text buffer."""
|
||||
if self._decoded_text_offset < n:
|
||||
raise AssertionError("unemit out of bounds")
|
||||
self._decoded_text_offset -= n
|
||||
if chars_to_skip:
|
||||
# Just like _read_chunk, feed the decoder and save a snapshot.
|
||||
input_chunk = self.buffer.read(bytes_to_feed)
|
||||
self._set_decoded_chars(
|
||||
self._decoder.decode(input_chunk, need_eof))
|
||||
self._snapshot = (dec_flags, input_chunk)
|
||||
|
||||
# Skip chars_to_skip of the decoded characters.
|
||||
if len(self._decoded_chars) < chars_to_skip:
|
||||
raise IOError("can't restore logical file position")
|
||||
self._decoded_chars_used = chars_to_skip
|
||||
|
||||
return cookie
|
||||
|
||||
def read(self, n=None):
|
||||
if n is None:
|
||||
|
@ -1470,17 +1466,18 @@ class TextIOWrapper(TextIOBase):
|
|||
decoder = self._decoder or self._get_decoder()
|
||||
if n < 0:
|
||||
# Read everything.
|
||||
result = (self._emit_decoded_text() +
|
||||
result = (self._get_decoded_chars() +
|
||||
decoder.decode(self.buffer.read(), final=True))
|
||||
self._clear_decoded_text()
|
||||
self._set_decoded_chars('')
|
||||
self._snapshot = None
|
||||
return result
|
||||
else:
|
||||
# Keep reading chunks until we have n characters to return.
|
||||
eof = False
|
||||
result = self._emit_decoded_text(n)
|
||||
result = self._get_decoded_chars(n)
|
||||
while len(result) < n and not eof:
|
||||
eof = not self._read_chunk()
|
||||
result += self._emit_decoded_text(n - len(result))
|
||||
result += self._get_decoded_chars(n - len(result))
|
||||
return result
|
||||
|
||||
def __next__(self):
|
||||
|
@ -1497,7 +1494,7 @@ class TextIOWrapper(TextIOBase):
|
|||
limit = -1
|
||||
|
||||
# Grab all the decoded text (we will rewind any extra bits later).
|
||||
line = self._emit_decoded_text()
|
||||
line = self._get_decoded_chars()
|
||||
|
||||
start = 0
|
||||
decoder = self._decoder or self._get_decoder()
|
||||
|
@ -1558,20 +1555,21 @@ class TextIOWrapper(TextIOBase):
|
|||
# No line ending seen yet - get more data
|
||||
more_line = ''
|
||||
while self._read_chunk():
|
||||
if self._decoded_text:
|
||||
if self._decoded_chars:
|
||||
break
|
||||
if self._decoded_text:
|
||||
line += self._emit_decoded_text()
|
||||
if self._decoded_chars:
|
||||
line += self._get_decoded_chars()
|
||||
else:
|
||||
# end of file
|
||||
self._clear_decoded_text()
|
||||
self._set_decoded_chars('')
|
||||
self._snapshot = None
|
||||
return line
|
||||
|
||||
if limit >= 0 and endpos > limit:
|
||||
endpos = limit # don't exceed limit
|
||||
|
||||
# Rewind _decoded_text to just after the line ending we found.
|
||||
self._unemit_decoded_text(len(line) - endpos)
|
||||
# Rewind _decoded_chars to just after the line ending we found.
|
||||
self._rewind_decoded_chars(len(line) - endpos)
|
||||
return line[:endpos]
|
||||
|
||||
@property
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue