mirror of
				https://github.com/python/cpython.git
				synced 2025-11-04 11:49:12 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			418 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			418 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
""" codecs -- Python Codec Registry, API and helpers.
 | 
						|
 | 
						|
 | 
						|
Written by Marc-Andre Lemburg (mal@lemburg.com).
 | 
						|
 | 
						|
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
 | 
						|
 | 
						|
"""#"
 | 
						|
 | 
						|
import struct,types,__builtin__
 | 
						|
 | 
						|
### Registry and builtin stateless codec functions
 | 
						|
 | 
						|
try:
 | 
						|
    from _codecs import *
 | 
						|
except ImportError,why:
 | 
						|
    raise SystemError,\
 | 
						|
          'Failed to load the builtin codecs: %s' % why
 | 
						|
 | 
						|
### Constants
 | 
						|
 | 
						|
#
# Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
#
# BOM is U+FEFF packed as a 2-byte unsigned short; the '=' prefix selects
# the byte order of the machine running this code, so the result is equal
# to either BOM_BE or BOM_LE.
BOM = struct.pack('=H',0xFEFF)
#
BOM_BE = BOM32_BE = '\376\377'
#	bytes FE FF: Unicode U+FEFF (ZERO WIDTH NO-BREAK SPACE) stored in
#	big endian byte order (UTF-16)
BOM_LE = BOM32_LE = '\377\376'
#	bytes FF FE: the same U+FEFF stored in little endian byte order
#	(UTF-16); a reader assuming big endian order sees U+FFFE instead,
#	which is defined as being an illegal Unicode character

#
# 4-byte Byte Order Marks (UCS-4)
#
# NOTE: despite the "64" in the names, each of these values is four
# bytes (32 bits) long.
BOM64_BE = '\000\000\376\377'
#	bytes 00 00 FE FF: Unicode U+0000FEFF in big endian UCS-4
BOM64_LE = '\377\376\000\000'
#	bytes FF FE 00 00: Unicode U+0000FEFF in little endian UCS-4; read
#	in big endian order this would be 0xFFFE0000
 | 
						|
 | 
						|
 | 
						|
### Codec base classes (defining the API)
 | 
						|
 | 
						|
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument which
        selects the error handling scheme to use. The following
        string values are defined:

         'strict'  - raise a ValueError (or a subclass)
         'ignore'  - skip the offending character and carry on with
                     the next one
         'replace' - substitute a suitable replacement character;
                     the builtin Unicode codecs use the official
                     U+FFFD REPLACEMENT CHARACTER for this.

        This base class implements nothing itself: both methods
        raise NotImplementedError and must be overridden.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input.

            Returns a tuple (output object, length consumed).

            errors selects the error handling scheme; the default is
            'strict'.

            Implementations must not keep state in the Codec
            instance. Codecs which need state to work efficiently
            should keep it in a stream class (see StreamWriter)
            instead.

            Zero length input has to be supported; the result in
            that case is an empty object of the output object type.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input.

            Returns a tuple (output object, length consumed).

            input must be an object providing the bf_getreadbuf
            buffer slot, e.g. Python strings, buffer objects or
            memory mapped files.

            errors selects the error handling scheme; the default is
            'strict'.

            Implementations must not keep state in the Codec
            instance. Codecs which need state to work efficiently
            should keep it in a stream class (see StreamReader)
            instead.

            Zero length input has to be supported; the result in
            that case is an empty object of the output object type.

        """
        raise NotImplementedError()
 | 
						|
 | 
						|
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
 | 
						|
 | 
						|
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter on top of stream.

            stream has to be a file-like object which was opened for
            writing (binary) data.

            The errors keyword argument selects the error handling
            scheme which is handed to .encode() on every write.
            These values are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - skip the offending character and carry on
                        with the next one
             'replace'- substitute a suitable replacement character

        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object with self.encode() using the error handling
            stored in self.errors and write the result to self.stream.
        """
        encoded, consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    # XXX .writelines() ?

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After calling this method the output should be in a
            clean state which allows appending fresh data without
            having to rescan the whole stream to recover state.

            This base implementation keeps no state and does nothing.

        """
        pass

    def __getattr__(self, name,

                    getattr=getattr):

        """ Look up every attribute not found on the writer itself on
            the underlying stream.
        """
        stream = self.stream
        return getattr(stream, name)
 | 
						|
 | 
						|
###
 | 
						|
 | 
						|
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. The
            value is stored in self.errors and passed on to .decode()
            by .read(). These parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def read(self, size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

            Decoding uses the error handling scheme stored in
            self.errors. If the data read cannot be decoded, the
            ValueError raised by .decode() is propagated once the
            stream is exhausted or the retry limit is reached.

        """
        errors = self.errors

        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        data = read(size)
        tries = 0
        while 1:
            try:
                decoded, decodedbytes = decode(data, errors)
            except ValueError:
                # The chunk may have been cut in the middle of a
                # multi-byte sequence: extend it one byte at a time and
                # try again. This method is slow but should work under
                # pretty much all conditions; at most 10 retries are made
                tries = tries + 1
                newdata = read(1)
                if not newdata or tries > 10:
                    # Out of data or out of retries: re-raise the
                    # decoding error
                    raise
                data = data + newdata
            else:
                return decoded

    # XXX .readline() and .readlines() (these are hard to implement
    #     without using buffers for keeping read-ahead data)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

            This base implementation keeps no state and does nothing.

        """
        pass

    def __getattr__(self, name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
 | 
						|
 | 
						|
###
 | 
						|
 | 
						|
class StreamReaderWriter:

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Set up a StreamReaderWriter on top of stream.

            stream has to be a Stream-like object.

            Reader and Writer are factory functions or classes which
            provide the StreamReader and StreamWriter interface,
            respectively. Each one is called with (stream, errors)
            and the results are stored as self.reader and
            self.writer.

            errors is interpreted in the same way as defined for the
            StreamWriter/Readers.

        """
        reader = Reader(stream, errors)
        writer = Writer(stream, errors)
        self.stream = stream
        self.reader = reader
        self.writer = writer
        self.errors = errors

    def read(self, size=-1):

        """ Read via the reader object; size is handed through as is.
        """
        reader = self.reader
        return reader.read(size)

    def write(self, data):

        """ Write data via the writer object and return its result.
        """
        writer = self.writer
        return writer.write(data)

    def reset(self):

        """ Reset the reader first and the writer second.
        """
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,

                    getattr=getattr):

        """ Look up every attribute not found on this object on the
            underlying stream.
        """
        stream = self.stream
        return getattr(stream, name)
 | 
						|
 | 
						|
###
 | 
						|
 | 
						|
class StreamRecoder:

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Set up a StreamRecoder which converts in two directions.

            encode and decode do the frontend translation, i.e. they
            are applied to the data returned by .read() and to the
            data passed to .write(). Reader and Writer do the
            backend translation, i.e. the actual reading from and
            writing to the stream. Unicode is used as intermediate
            encoding.

            Objects of this class can be used for transparent direct
            recodings, e.g. from latin-1 to utf-8 and back.

            stream has to be a file-like object.

            encode and decode must follow the Codec interface.
            Reader and Writer are factory functions or classes which
            provide the StreamReader and StreamWriter interface,
            respectively; each one is called with (stream, errors).

            errors is interpreted in the same way as defined for the
            StreamWriter/Readers.

        """
        reader = Reader(stream, errors)
        writer = Writer(stream, errors)
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = reader
        self.writer = writer
        self.errors = errors

    def read(self, size=-1):

        """ Read via the reader object, then pass the result through
            self.encode() and return the encoded data.
        """
        intermediate = self.reader.read(size)
        recoded, bytesencoded = self.encode(intermediate, self.errors)
        return recoded

    def write(self, data):

        """ Pass data through self.decode(), then write the result
            via the writer object and return the writer's result.
        """
        intermediate, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(intermediate)

    # .writelines(), .readline() and .readlines() ... see notes
    # above.

    def reset(self):

        """ Reset the reader first and the writer second.
        """
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,

                    getattr=getattr):

        """ Look up every attribute not found on this object on the
            underlying stream.
        """
        stream = self.stream
        return getattr(stream, name)
 | 
						|
 | 
						|
### Shortcuts
 | 
						|
 | 
						|
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        mode has the same meaning as for the builtin open() API and
        defaults to 'rb'. If an encoding is given, the file is always
        opened in binary mode: a 'b' is appended to mode when it is
        missing.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned without any wrapping.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        If the codec lookup or the creation of the wrapper fails, the
        file is closed again before the exception is passed on.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        # Only the stream reader/writer factories of the codec are needed
        sr, sw = lookup(encoding)[2:4]
        return StreamReaderWriter(file, sr, sw, errors)
    except:
        # Don't leak the already opened file, e.g. in case the encoding
        # is unknown
        file.close()
        raise
 | 
						|
 | 
						|
def EncodedFile(file, input, output=None, errors='strict'):

    """ Wrap file so that it transparently translates between two
        encodings and return the wrapper (a StreamRecoder).

        Strings written to the wrapper are interpreted according to
        the input encoding and are then written to the original file
        as strings in the output encoding. The intermediate encoding
        is usually Unicode, but this depends on the codecs given.

        output defaults to input if it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

    """
    if output is None:
        output = input
    # Frontend: the stateless encoder/decoder of the input encoding
    frontend = lookup(input)
    # Backend: the stream reader/writer factories of the output encoding
    backend = lookup(output)
    encode, decode = frontend[:2]
    Reader, Writer = backend[2:]
    return StreamRecoder(file, encode, decode, Reader, Writer, errors)
 | 
						|
 | 
						|
### Tests
 | 
						|
    
 | 
						|
if __name__ == '__main__':
 | 
						|
 | 
						|
    import sys
 | 
						|
    
 | 
						|
    # Make stdout translate Latin-1 into Unicode-Escape
 | 
						|
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'unicode-escape')
 |