This patch changes the way the string .encode() method works slightly

and introduces a new method .decode().

The major change is that strg.encode() will no longer try to convert
Unicode returns from the codec into a string, but instead pass along
the Unicode object as-is. The same is now true for all other codec
return types. The underlying C APIs were changed accordingly.

Note that even though this does have the potential of breaking
existing code, the chances are low since conversion from Unicode
previously took place using the default encoding which is normally
set to ASCII rendering this auto-conversion mechanism useless for
most Unicode encodings.

The good news is that you can now use .encode() and .decode() with
much greater ease and that the door was opened for better accessibility
of the builtin codecs.

As demonstration of the new feature, the patch includes a few new
codecs which allow string to string encoding and decoding (rot13,
hex, zip, uu, base64).

Written by Marc-Andre Lemburg. Copyright assigned to the PSF.
This commit is contained in:
Marc-André Lemburg 2001-05-15 12:00:02 +00:00
parent 2e0a654f6e
commit 2d9204199f
11 changed files with 594 additions and 39 deletions

View file

@ -79,4 +79,13 @@ aliases = {
'tis260': 'tactis',
'sjis': 'shift_jis',
# Content transfer/compression encodings
'rot13': 'rot_13',
'base64': 'base64_codec',
'base_64': 'base64_codec',
'zlib': 'zlib_codec',
'zip': 'zlib_codec',
'hex': 'hex_codec',
'uu': 'uu_codec',
}

View file

@ -0,0 +1,60 @@
""" Python 'base64_codec' Codec - base64 content transfer encoding
Unlike most of the other codecs which target Unicode, this codec
will return Python string objects for both encode and decode.
Written by Marc-Andre Lemburg (mal@lemburg.com).
"""
import codecs, base64
### Codec APIs
def base64_encode(input,errors='strict'):
""" Encodes the object input and returns a tuple (output
object, length consumed).
errors defines the error handling to apply. It defaults to
'strict' handling which is the only currently supported
error handling for this codec.
"""
assert errors == 'strict'
output = base64.encodestring(input)
return (output, len(input))
def base64_decode(input,errors='strict'):
""" Decodes the object input and returns a tuple (output
object, length consumed).
input must be an object which provides the bf_getreadbuf
buffer slot. Python strings, buffer objects and memory
mapped files are examples of objects providing this slot.
errors defines the error handling to apply. It defaults to
'strict' handling which is the only currently supported
error handling for this codec.
"""
assert errors == 'strict'
output = base64.decodestring(input)
return (output, len(input))
class Codec(codecs.Codec):
encode = base64_encode
decode = base64_decode
class StreamWriter(Codec,codecs.StreamWriter):
pass
class StreamReader(Codec,codecs.StreamReader):
pass
### encodings module API
def getregentry():
return (base64_encode,base64_decode,StreamReader,StreamWriter)

View file

@ -0,0 +1,60 @@
""" Python 'hex_codec' Codec - 2-digit hex content transfer encoding
Unlike most of the other codecs which target Unicode, this codec
will return Python string objects for both encode and decode.
Written by Marc-Andre Lemburg (mal@lemburg.com).
"""
import codecs, binascii
### Codec APIs
def hex_encode(input,errors='strict'):
""" Encodes the object input and returns a tuple (output
object, length consumed).
errors defines the error handling to apply. It defaults to
'strict' handling which is the only currently supported
error handling for this codec.
"""
assert errors == 'strict'
output = binascii.b2a_hex(input)
return (output, len(input))
def hex_decode(input,errors='strict'):
""" Decodes the object input and returns a tuple (output
object, length consumed).
input must be an object which provides the bf_getreadbuf
buffer slot. Python strings, buffer objects and memory
mapped files are examples of objects providing this slot.
errors defines the error handling to apply. It defaults to
'strict' handling which is the only currently supported
error handling for this codec.
"""
assert errors == 'strict'
output = binascii.a2b_hex(input)
return (output, len(input))
class Codec(codecs.Codec):
encode = hex_encode
decode = hex_decode
class StreamWriter(Codec,codecs.StreamWriter):
pass
class StreamReader(Codec,codecs.StreamReader):
pass
### encodings module API
def getregentry():
return (hex_encode,hex_decode,StreamReader,StreamWriter)

107
Lib/encodings/rot_13.py Normal file
View file

@ -0,0 +1,107 @@
#!/usr/local/bin/python2.1
""" Python Character Mapping Codec for ROT13.
See http://ucsub.colorado.edu/~kominek/rot13/ for details.
Written by Marc-Andre Lemburg (mal@lemburg.com).
"""#"
import codecs
### Codec APIs
class Codec(codecs.Codec):
def encode(self,input,errors='strict'):
return codecs.charmap_encode(input,errors,encoding_map)
def decode(self,input,errors='strict'):
return codecs.charmap_decode(input,errors,decoding_map)
class StreamWriter(Codec,codecs.StreamWriter):
pass
class StreamReader(Codec,codecs.StreamReader):
pass
### encodings module API
def getregentry():
return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
### Decoding Map
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
0x0041: 0x004e,
0x0042: 0x004f,
0x0043: 0x0050,
0x0044: 0x0051,
0x0045: 0x0052,
0x0046: 0x0053,
0x0047: 0x0054,
0x0048: 0x0055,
0x0049: 0x0056,
0x004a: 0x0057,
0x004b: 0x0058,
0x004c: 0x0059,
0x004d: 0x005a,
0x004e: 0x0041,
0x004f: 0x0042,
0x0050: 0x0043,
0x0051: 0x0044,
0x0052: 0x0045,
0x0053: 0x0046,
0x0054: 0x0047,
0x0055: 0x0048,
0x0056: 0x0049,
0x0057: 0x004a,
0x0058: 0x004b,
0x0059: 0x004c,
0x005a: 0x004d,
0x0061: 0x006e,
0x0062: 0x006f,
0x0063: 0x0070,
0x0064: 0x0071,
0x0065: 0x0072,
0x0066: 0x0073,
0x0067: 0x0074,
0x0068: 0x0075,
0x0069: 0x0076,
0x006a: 0x0077,
0x006b: 0x0078,
0x006c: 0x0079,
0x006d: 0x007a,
0x006e: 0x0061,
0x006f: 0x0062,
0x0070: 0x0063,
0x0071: 0x0064,
0x0072: 0x0065,
0x0073: 0x0066,
0x0074: 0x0067,
0x0075: 0x0068,
0x0076: 0x0069,
0x0077: 0x006a,
0x0078: 0x006b,
0x0079: 0x006c,
0x007a: 0x006d,
})
### Encoding Map
encoding_map = {}
for k,v in decoding_map.items():
encoding_map[v] = k
### Filter API
def rot13(infile, outfile):
outfile.write(infile.read().encode('rot-13'))
if __name__ == '__main__':
import sys
rot13(sys.stdin, sys.stdout)

110
Lib/encodings/uu_codec.py Normal file
View file

@ -0,0 +1,110 @@
""" Python 'uu_codec' Codec - UU content transfer encoding
Unlike most of the other codecs which target Unicode, this codec
will return Python string objects for both encode and decode.
Written by Marc-Andre Lemburg (mal@lemburg.com). Some details were
adapted from uu.py which was written by Lance Ellinghouse and
modified by Jack Jansen and Fredrik Lundh.
"""
import codecs, binascii
### Codec APIs
def uu_encode(input,errors='strict',filename='<data>',mode=0666):
""" Encodes the object input and returns a tuple (output
object, length consumed).
errors defines the error handling to apply. It defaults to
'strict' handling which is the only currently supported
error handling for this codec.
"""
assert errors == 'strict'
from cStringIO import StringIO
from binascii import b2a_uu
infile = StringIO(input)
outfile = StringIO()
read = infile.read
write = outfile.write
# Encode
write('begin %o %s\n' % (mode & 0777, filename))
chunk = read(45)
while chunk:
write(b2a_uu(chunk))
chunk = read(45)
write(' \nend\n')
return (outfile.getvalue(), len(input))
def uu_decode(input,errors='strict'):
""" Decodes the object input and returns a tuple (output
object, length consumed).
input must be an object which provides the bf_getreadbuf
buffer slot. Python strings, buffer objects and memory
mapped files are examples of objects providing this slot.
errors defines the error handling to apply. It defaults to
'strict' handling which is the only currently supported
error handling for this codec.
Note: filename and file mode information in the input data is
ignored.
"""
assert errors == 'strict'
from cStringIO import StringIO
from binascii import a2b_uu
infile = StringIO(input)
outfile = StringIO()
readline = infile.readline
write = outfile.write
# Find start of encoded data
while 1:
s = readline()
if not s:
raise ValueError, 'Missing "begin" line in input data'
if s[:5] == 'begin':
break
# Decode
while 1:
s = readline()
if not s or \
s == 'end\n':
break
try:
data = a2b_uu(s)
except binascii.Error, v:
# Workaround for broken uuencoders by /Fredrik Lundh
nbytes = (((ord(s[0])-32) & 63) * 4 + 5) / 3
data = a2b_uu(s[:nbytes])
#sys.stderr.write("Warning: %s\n" % str(v))
write(data)
if not s:
raise ValueError, 'Truncated input data'
return (outfile.getvalue(), len(input))
class Codec(codecs.Codec):
encode = uu_encode
decode = uu_decode
class StreamWriter(Codec,codecs.StreamWriter):
pass
class StreamReader(Codec,codecs.StreamReader):
pass
### encodings module API
def getregentry():
return (uu_encode,uu_decode,StreamReader,StreamWriter)

View file

@ -0,0 +1,61 @@
""" Python 'zlib_codec' Codec - zlib compression encoding
Unlike most of the other codecs which target Unicode, this codec
will return Python string objects for both encode and decode.
Written by Marc-Andre Lemburg (mal@lemburg.com).
"""
import codecs
import zlib # this codec needs the optional zlib module !
### Codec APIs
def zlib_encode(input,errors='strict'):
""" Encodes the object input and returns a tuple (output
object, length consumed).
errors defines the error handling to apply. It defaults to
'strict' handling which is the only currently supported
error handling for this codec.
"""
assert errors == 'strict'
output = zlib.compress(input)
return (output, len(input))
def zlib_decode(input,errors='strict'):
""" Decodes the object input and returns a tuple (output
object, length consumed).
input must be an object which provides the bf_getreadbuf
buffer slot. Python strings, buffer objects and memory
mapped files are examples of objects providing this slot.
errors defines the error handling to apply. It defaults to
'strict' handling which is the only currently supported
error handling for this codec.
"""
assert errors == 'strict'
output = zlib.decompress(input)
return (output, len(input))
class Codec(codecs.Codec):
encode = zlib_encode
decode = zlib_decode
class StreamWriter(Codec,codecs.StreamWriter):
pass
class StreamReader(Codec,codecs.StreamReader):
pass
### encodings module API
def getregentry():
return (zlib_encode,zlib_decode,StreamReader,StreamWriter)