Patch #1436130: codecs.lookup() now returns a CodecInfo object (a subclass

of tuple) that provides incremental decoders and encoders (a way to use stateful codecs without the stream API). Functions codecs.getincrementaldecoder() and codecs.getincrementalencoder() have been added.
2025-07-27 21:24:32 +00:00 · 2006-03-15 11:35:15 +00:00 · 2006-03-15 11:35:15 +00:00 · abb02e5994
commit abb02e5994
parent e2ebb2d7f7
98 changed files with 2212 additions and 420 deletions
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@ -73,6 +73,23 @@ BOM64_BE = BOM_UTF32_BE

 ### Codec base classes (defining the API)

+class CodecInfo(tuple):
+
+    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
+        incrementalencoder=None, incrementaldecoder=None, name=None):
+        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
+        self.name = name
+        self.encode = encode
+        self.decode = decode
+        self.incrementalencoder = incrementalencoder
+        self.incrementaldecoder = incrementaldecoder
+        self.streamwriter = streamwriter
+        self.streamreader = streamreader
+        return self
+
+    def __repr__(self):
+        return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self))
+
 class Codec:

    """ Defines the interface for stateless encoders/decoders.
@ -137,6 +154,88 @@ class Codec:
        """
        raise NotImplementedError

+class IncrementalEncoder(object):
+    """
+    A IncrementalEncoder encodes an input in multiple steps. The input can be
+    passed piece by piece to the encode() method. The IncrementalEncoder remembers
+    the state of the Encoding process between calls to encode().
+    """
+    def __init__(self, errors='strict'):
+        """
+        Creates a IncrementalEncoder instance.
+
+        The IncrementalEncoder may use different error handling schemes by
+        providing the errors keyword argument. See the module docstring
+        for a list of possible values.
+        """
+        self.errors = errors
+        self.buffer = ""
+
+    def encode(self, input, final=False):
+        """
+        Encodes input and returns the resulting object.
+        """
+        raise NotImplementedError
+
+    def reset(self):
+        """
+        Resets the encoder to the initial state.
+        """
+
+class IncrementalDecoder(object):
+    """
+    An IncrementalDecoder decodes an input in multiple steps. The input can be
+    passed piece by piece to the decode() method. The IncrementalDecoder
+    remembers the state of the decoding process between calls to decode().
+    """
+    def __init__(self, errors='strict'):
+        """
+        Creates a IncrementalDecoder instance.
+
+        The IncrementalDecoder may use different error handling schemes by
+        providing the errors keyword argument. See the module docstring
+        for a list of possible values.
+        """
+        self.errors = errors
+
+    def decode(self, input, final=False):
+        """
+        Decodes input and returns the resulting object.
+        """
+        raise NotImplementedError
+
+    def reset(self):
+        """
+        Resets the decoder to the initial state.
+        """
+
+class BufferedIncrementalDecoder(IncrementalDecoder):
+    """
+    This subclass of IncrementalDecoder can be used as the baseclass for an
+    incremental decoder if the decoder must be able to handle incomplete byte
+    sequences.
+    """
+    def __init__(self, errors='strict'):
+        IncrementalDecoder.__init__(self, errors)
+        self.buffer = "" # undecoded input that is kept between calls to decode()
+
+    def _buffer_decode(self, input, errors, final):
+        # Overwrite this method in subclasses: It must decode input
+        # and return an (output, length consumed) tuple
+        raise NotImplementedError
+
+    def decode(self, input, final=False):
+        # decode input (taking the buffer into account)
+        data = self.buffer + input
+        (result, consumed) = self._buffer_decode(data, self.errors, final)
+        # keep undecoded input until the next call
+        self.buffer = data[consumed:]
+        return result
+
+    def reset(self):
+        IncrementalDecoder.reset(self)
+        self.bytebuffer = ""
+
 #
 # The StreamWriter and StreamReader class provide generic working
 # interfaces which can be used to implement new encoding submodules
@ -666,8 +765,8 @@ def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
-    (e, d, sr, sw) = lookup(encoding)
-    srw = StreamReaderWriter(file, sr, sw, errors)
+    info = lookup(encoding)
+    srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
@ -699,11 +798,9 @@ def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
    """
    if file_encoding is None:
        file_encoding = data_encoding
-    encode, decode = lookup(data_encoding)[:2]
-    Reader, Writer = lookup(file_encoding)[2:]
-    sr = StreamRecoder(file,
-                       encode, decode, Reader, Writer,
-                       errors)
+    info = lookup(data_encoding)
+    sr = StreamRecoder(file, info.encode, info.decode,
+                       info.streamreader, info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
@ -719,7 +816,7 @@ def getencoder(encoding):
        Raises a LookupError in case the encoding cannot be found.

    """
-    return lookup(encoding)[0]
+    return lookup(encoding).encode

 def getdecoder(encoding):

@ -729,7 +826,35 @@ def getdecoder(encoding):
        Raises a LookupError in case the encoding cannot be found.

    """
-    return lookup(encoding)[1]
+    return lookup(encoding).decode
+
+def getincrementalencoder(encoding):
+
+    """ Lookup up the codec for the given encoding and return
+        its IncrementalEncoder class or factory function.
+
+        Raises a LookupError in case the encoding cannot be found
+        or the codecs doesn't provide an incremental encoder.
+
+    """
+    encoder = lookup(encoding).incrementalencoder
+    if encoder is None:
+        raise LookupError(encoding)
+    return encoder
+
+def getincrementaldecoder(encoding):
+
+    """ Lookup up the codec for the given encoding and return
+        its IncrementalDecoder class or factory function.
+
+        Raises a LookupError in case the encoding cannot be found
+        or the codecs doesn't provide an incremental decoder.
+
+    """
+    decoder = lookup(encoding).incrementaldecoder
+    if decoder is None:
+        raise LookupError(encoding)
+    return decoder

 def getreader(encoding):

@ -739,7 +864,7 @@ def getreader(encoding):
        Raises a LookupError in case the encoding cannot be found.

    """
-    return lookup(encoding)[2]
+    return lookup(encoding).streamreader

 def getwriter(encoding):

@ -749,7 +874,43 @@ def getwriter(encoding):
        Raises a LookupError in case the encoding cannot be found.

    """
-    return lookup(encoding)[3]
+    return lookup(encoding).streamwriter
+
+def iterencode(iterator, encoding, errors='strict', **kwargs):
+    """
+    Encoding iterator.
+
+    Encodes the input strings from the iterator using a IncrementalEncoder.
+
+    errors and kwargs are passed through to the IncrementalEncoder
+    constructor.
+    """
+    encoder = getincrementalencoder(encoding)(errors, **kwargs)
+    for input in iterator:
+        output = encoder.encode(input)
+        if output:
+            yield output
+    output = encoder.encode("", True)
+    if output:
+        yield output
+
+def iterdecode(iterator, encoding, errors='strict', **kwargs):
+    """
+    Decoding iterator.
+
+    Decodes the input strings from the iterator using a IncrementalDecoder.
+
+    errors and kwargs are passed through to the IncrementalDecoder
+    constructor.
+    """
+    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
+    for input in iterator:
+        output = decoder.decode(input)
+        if output:
+            yield output
+    output = decoder.decode("", True)
+    if output:
+        yield output

 ### Helpers for charmap-based codecs