mirror of
				https://github.com/python/cpython.git
				synced 2025-11-04 11:49:12 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			1101 lines
		
	
	
	
		
			34 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			1101 lines
		
	
	
	
		
			34 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
""" codecs -- Python Codec Registry, API and helpers.
 | 
						|
 | 
						|
 | 
						|
Written by Marc-Andre Lemburg (mal@lemburg.com).
 | 
						|
 | 
						|
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
 | 
						|
 | 
						|
"""#"
 | 
						|
 | 
						|
import builtins, sys
 | 
						|
 | 
						|
### Registry and builtin stateless codec functions
 | 
						|
 | 
						|
# The registry functions and the builtin stateless codecs are implemented
# in the _codecs extension module; everything below relies on them.
try:
    from _codecs import *
except ImportError as why:
    # Chain explicitly so the traceback names the import failure as the
    # direct cause of the SystemError.
    raise SystemError('Failed to load the builtin codecs: %s' % why) from why
 | 
						|
 | 
						|
# Public names exported by "from codecs import *".
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]
 | 
						|
 | 
						|
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code): the "32"/"64" suffixes are
# aliases for the UTF-16 and UTF-32 marks respectively.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
 | 
						|
 | 
						|
 | 
						|
### Codec base classes (defining the API)
 | 
						|
 | 
						|
class CodecInfo(tuple):

    """ Bundles the callables and factories that make up one codec.

        An instance is the 4-tuple (encode, decode, streamreader,
        streamwriter); every constructor argument is additionally
        available as an attribute of the same name.

    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        # Only the first four arguments are part of the tuple value; the
        # incremental factories and the name are stored as attributes only.
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at 0x%x>" % \
                (self.__class__.__module__, self.__class__.__name__,
                 self.name, id(self))
 | 
						|
 | 
						|
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses must override it.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            For text encodings input must be a bytes object or another
            object providing the read-only buffer interface, for
            example a memory mapped file.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses must override it.

        """
        raise NotImplementedError
 | 
						|
 | 
						|
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the Codec class
        docstring for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.

        The base implementation raises NotImplementedError; subclasses
        must override it.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the encoder.

        The base implementation always returns 0.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().

        The base implementation ignores state.
        """
 | 
						|
 | 
						|
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the input in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        """
        Encodes the buffered input followed by input and returns the
        result; whatever _buffer_encode() did not consume is kept for
        the next call.
        """
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        """
        Resets the encoder and discards any buffered input.
        """
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        """
        Return the buffered input, or 0 if the buffer is empty.
        """
        return self.buffer or 0

    def setstate(self, state):
        """
        Restore the buffer from a value returned by getstate(); a false
        state (such as 0) yields an empty buffer.
        """
        self.buffer = state or ""
 | 
						|
 | 
						|
class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the Codec class
        docstring for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        The base implementation raises NotImplementedError; subclasses
        must override it.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().

        The base implementation ignores state.
        """
 | 
						|
 | 
						|
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        """
        Decode the buffered bytes followed by input and return the
        result; whatever _buffer_decode() did not consume is kept for
        the next call.
        """
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        """
        Reset the decoder and discard any buffered bytes.
        """
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        """
        Return (buffered_bytes, 0).
        """
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        """
        Restore the buffer from the first item of state.
        """
        # ignore additional state info
        self.buffer = state[0]
 | 
						|
 | 
						|
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
 | 
						|
 | 
						|
class StreamWriter(Codec):

    """ Encodes objects with self.encode() and writes the result to an
        underlying stream. Attributes not defined here are looked up
        on that stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):

        """ Sets the underlying stream's position.

            The codec state is reset only when seeking to the very
            start of the stream (offset 0, whence 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        if name == 'stream':
            # __getattr__ only runs when normal lookup failed, so the
            # instance has no 'stream' attribute (e.g. it was created via
            # __new__ without __init__).  Delegating would look up
            # self.stream again and recurse without bound.
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the underlying stream.
        self.stream.close()
 | 
						|
 | 
						|
###
 | 
						|
 | 
						|
class StreamReader(Codec):

    """ Reads bytes from an underlying stream and decodes them with
        self.decode(). Attributes not defined here are looked up on
        that stream.
    """

    # Type used to build the (empty) character buffer.
    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that could not be decoded yet
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        # decoded characters not yet returned to the caller
        self.charbuffer = self._empty_charbuffer
        # list of complete lines cached by readline(); while it is set,
        # charbuffer is None and the lines hold the buffered characters
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If both size and chars are negative, everything up to the
            end of the stream is read and returned, including characters
            buffered by earlier read() or readline() calls.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # With chars < 0 and size < 0 the caller asked for everything.
            # Buffered characters alone (e.g. left over from readline())
            # never satisfy that, so keep reading until the stream returns
            # no more data below.
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Decode only the part before the error; this is
                    # acceptable if it contains at least one full line.
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            If keepends is false the line terminator is stripped from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # grow the read size for the next attempt, up to a limit
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries unless
            keepends is false.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        if name == 'stream':
            # __getattr__ only runs when normal lookup failed, so the
            # instance has no 'stream' attribute (e.g. it was created via
            # __new__ without __init__).  Delegating would look up
            # self.stream again and recurse without bound.
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the underlying stream.
        self.stream.close()
 | 
						|
 | 
						|
###
 | 
						|
 | 
						|
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codecs.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        # Both wrappers operate on the same underlying stream.
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Return self.reader.read(size)."""
        return self.reader.read(size)

    def readline(self, size=None):

        """ Return self.reader.readline(size)."""
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return self.reader.readlines(sizehint)."""
        return self.reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        """ Return self.writer.write(data)."""
        return self.writer.write(data)

    def writelines(self, list):

        """ Return self.writer.writelines(list)."""
        return self.writer.writelines(list)

    def reset(self):

        """ Reset both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Set the underlying stream's position.

            The reader is always reset; the writer is reset only when
            seeking to the very start (offset 0, whence 0).
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        if name == 'stream':
            # __getattr__ only runs when normal lookup failed, so the
            # instance has no 'stream' attribute (e.g. it was created via
            # __new__ without __init__).  Delegating would look up
            # self.stream again and recurse without bound.
            raise AttributeError(name)
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
 | 
						|
 | 
						|
###
 | 
						|
 | 
						|
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the recoder is first passed through the
        decode function into an intermediate format (which depends on
        the given codec combination) and then written to the stream
        using an instance of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance, passed through the encode function and then
        returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read through the backend reader (size is forwarded as is)
            and return the result re-encoded with the frontend encode
            function.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line through the backend reader and return it
            re-encoded with the frontend encode function.

            size is only forwarded to the reader when it is not None.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything that is left, re-encode it and return it
            split into a list of lines with line ends kept.

            sizehint is accepted for interface compatibility but is
            not used.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded
            with the frontend encode function."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decode function and write
            the result through the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the given chunks, decode the result with the
            frontend decode function and write it through the backend
            writer.
        """
        chunks = tuple(list)
        # Join with an empty separator of the same type as the chunks.
        # A hard-coded '' fails with TypeError for bytes chunks, which is
        # what a bytes-consuming decode function (see write()) needs.
        separator = chunks[0][:0] if chunks else b''
        data = separator.join(chunks)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the backend reader and then the backend writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Reposition the underlying stream and reset the codec state.

            Without this method the call would be forwarded by
            __getattr__ straight to the stream, leaving data buffered
            by the reader in place. The reader is reset after every
            seek; the writer is reset only when seeking to offset 0
            with whence 0.

            Returns whatever the stream's own seek() returns.
        """
        position = self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()
        return position

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Closing the stream is the only cleanup; exceptions propagate.
        self.stream.close()
 | 
						|
 | 
						|
### Shortcuts
 | 
						|
 | 
						|
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering is passed on to the builtin open() API and has the
        same meaning as there. It defaults to 1.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        Raises a LookupError if the encoding is not known; the file
        opened so far is closed again before the error propagates.

    """
    if encoding is not None and 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader,
                                 info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # Nobody else holds the file yet, so close it here instead of
        # leaking the handle when the codec lookup or wrapping fails.
        file.close()
        raise
 | 
						|
 | 
						|
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings.

        The frontend encode/decode functions are taken from the codec
        registered for data_encoding, the backend stream reader and
        writer from the codec registered for file_encoding. Data
        written to the wrapper is therefore interpreted according to
        data_encoding and stored using file_encoding; data read from
        it is interpreted according to file_encoding and returned
        using data_encoding.

        If file_encoding is None, data_encoding is used for both
        sides.

        errors is handed to the StreamRecoder and defaults to
        'strict'.

        The returned object carries the two attributes .data_encoding
        and .file_encoding holding the encoding names that were used,
        for introspection by Python programs.

        Raises a LookupError if either encoding is not known.

    """
    backend_name = data_encoding if file_encoding is None else file_encoding
    frontend = lookup(data_encoding)
    backend = lookup(backend_name)
    recoder = StreamRecoder(file,
                            frontend.encode, frontend.decode,
                            backend.streamreader, backend.streamwriter,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = backend_name
    return recoder
 | 
						|
 | 
						|
### Helpers for codec lookup
 | 
						|
 | 
						|
def getencoder(encoding):

    """ Look up the codec registered for encoding and return its
        encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
 | 
						|
 | 
						|
def getdecoder(encoding):

    """ Look up the codec registered for encoding and return its
        decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
 | 
						|
 | 
						|
def getincrementalencoder(encoding):

    """ Look up the codec registered for encoding and return its
        IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
 | 
						|
 | 
						|
def getincrementaldecoder(encoding):

    """ Look up the codec registered for encoding and return its
        IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
 | 
						|
 | 
						|
def getreader(encoding):

    """ Look up the codec registered for encoding and return its
        StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
 | 
						|
 | 
						|
def getwriter(encoding):

    """ Look up the codec registered for encoding and return its
        StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
 | 
						|
 | 
						|
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every item of iterator to an IncrementalEncoder for encoding
    and yields each non-empty result, followed by whatever the encoder
    emits when it is finalized (if that is non-empty).

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    make_encoder = getincrementalencoder(encoding)
    incremental = make_encoder(errors, **kwargs)
    for chunk in iterator:
        encoded = incremental.encode(chunk)
        if encoded:
            yield encoded
    # Finalize: lets the encoder emit anything it is still holding.
    tail = incremental.encode("", True)
    if tail:
        yield tail
 | 
						|
 | 
						|
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every item of iterator to an IncrementalDecoder for decoding
    and yields each non-empty result, followed by whatever the decoder
    emits when it is finalized (if that is non-empty).

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    make_decoder = getincrementaldecoder(encoding)
    incremental = make_decoder(errors, **kwargs)
    for chunk in iterator:
        decoded = incremental.decode(chunk)
        if decoded:
            yield decoded
    # Finalize: lets the decoder emit anything it is still holding.
    tail = incremental.decode(b"", True)
    if tail:
        yield tail
 | 
						|
 | 
						|
### Helpers for charmap-based codecs
 | 
						|
 | 
						|
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng sequence
        is both a key and the value stored under that key.

    """
    return {item: item for item in rng}
 | 
						|
 | 
						|
def make_encoding_map(decoding_map):

    """ Invert a decoding map into an encoding map.

        Every value of decoding_map becomes a key that maps back to
        the key it was stored under. A value that occurs more than
        once is mapped to None instead (undefined mapping), causing
        an exception when encountered by the charmap codec during
        translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # Seeing a target again makes its reverse mapping ambiguous.
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
 | 
						|
 | 
						|
### error handlers
 | 
						|
 | 
						|
try:
    # Bind the registered handlers to module-level names in one go; a
    # failed lookup leaves nothing half-assigned before the fallback.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = map(lookup_error, ("strict",
                                                   "ignore",
                                                   "replace",
                                                   "xmlcharrefreplace",
                                                   "backslashreplace"))
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
 | 
						|
 | 
						|
# Tell modulefinder that using codecs probably needs the encodings
 | 
						|
# package
 | 
						|
# _false is deliberately falsy, so the guarded import below never runs
# when this module is executed; it is only present in the source text.
_false = 0
 | 
						|
if _false:
 | 
						|
    import encodings
 | 
						|
 | 
						|
### Tests
 | 
						|
 | 
						|
# Demo executed only when this file is run as a script: both standard
# streams are replaced by EncodedFile wrappers around the originals.
# NOTE(review): presumably sys.stdout/sys.stdin are text streams here,
# while the wrappers pass written data to a decode function - confirm
# that this demo still works as intended.
if __name__ == '__main__':
 | 
						|
 | 
						|
    # Make stdout translate Latin-1 output into UTF-8 output
 | 
						|
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
 | 
						|
 | 
						|
    # Have stdin translate Latin-1 input into UTF-8 input
 | 
						|
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')
 |