Sync'ing with standalone email package 2.0.1. This adds support for

non-us-ascii character sets in headers and bodies.  Some API changes
(with DeprecationWarnings for the old APIs).  Better RFC-compliant
implementations of base64 and quoted-printable.

Updated test cases.  Documentation updates to follow (after I finish
writing them ;).
This commit is contained in:
Barry Warsaw 2002-04-10 21:01:31 +00:00
parent 68e69338ae
commit 409a4c08b5
20 changed files with 2209 additions and 143 deletions

327
Lib/email/Charset.py Normal file
View file

@ -0,0 +1,327 @@
# Copyright (C) 2001,2002 Python Software Foundation
# Author: che@debian.org (Ben Gertzfield)
from types import UnicodeType
from email.Encoders import encode_7or8bit
import email.base64MIME
import email.quopriMIME
# Flags for types of header encodings
QP = 1 # Quoted-Printable
BASE64 = 2 # Base64
# In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7
MISC_LEN = 7
DEFAULT_CHARSET = 'us-ascii'
# Defaults
CHARSETS = {
# input header enc body enc output conv
'iso-8859-1': (QP, QP, None),
'iso-8859-2': (QP, QP, None),
'us-ascii': (None, None, None),
'big5': (BASE64, BASE64, None),
'gb2312': (BASE64, BASE64, None),
'euc-jp': (BASE64, None, 'iso-2022-jp'),
'shift_jis': (BASE64, None, 'iso-2022-jp'),
'iso-2022-jp': (BASE64, None, None),
'koi8-r': (BASE64, BASE64, None),
'utf-8': (BASE64, BASE64, 'utf-8'),
}
# Aliases for other commonly-used names for character sets. Map
# them to the real ones used in email.
ALIASES = {
'latin_1': 'iso-8859-1',
'latin-1': 'iso-8859-1',
'ascii': 'us-ascii',
}
# Map charsets to their Unicode codec strings. Note that the Japanese
# examples included below do not (yet) come with Python! They are available
# from http://pseudo.grad.sccs.chukyo-u.ac.jp/~kajiyama/python/
# The Chinese and Korean codecs are available from SourceForge:
#
# http://sourceforge.net/projects/python-codecs/
#
# although you'll need to check them out of cvs since they haven't been file
# released yet. You might also try to use
#
# http://www.freshports.org/port-description.php3?port=6702
#
# if you can get logged in. AFAICT, both the Chinese and Korean codecs are
# fairly experimental at this point.
CODEC_MAP = {
'euc-jp': 'japanese.euc-jp',
'iso-2022-jp': 'japanese.iso-2022-jp',
'shift_jis': 'japanese.shift_jis',
'gb2132': 'eucgb2312_cn',
'big5': 'big5_tw',
'utf-8': 'utf-8',
# Hack: We don't want *any* conversion for stuff marked us-ascii, as all
# sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
# Let that stuff pass through without conversion to/from Unicode.
'us-ascii': None,
}
# Convenience functions for extending the above mappings
def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
"""Add charset properties to the global map.
charset is the input character set, and must be the canonical name of a
character set.
Optional header_enc and body_enc is either Charset.QP for
quoted-printable, Charset.BASE64 for base64 encoding, or None for no
encoding. It describes how message headers and message bodies in the
input charset are to be encoded. Default is no encoding.
Optional output_charset is the character set that the output should be
in. Conversions will proceed from input charset, to Unicode, to the
output charset when the method Charset.convert() is called. The default
is to output in the same character set as the input.
Both input_charset and output_charset must have Unicode codec entries in
the module's charset-to-codec mapping; use add_codec(charset, codecname)
to add codecs the module does not know about. See the codec module's
documentation for more information.
"""
CHARSETS[charset] = (header_enc, body_enc, output_charset)
def add_alias(alias, canonical):
"""Add a character set alias.
alias is the alias name, e.g. latin-1
canonical is the character set's canonical name, e.g. iso-8859-1
"""
ALIASES[alias] = canonical
def add_codec(charset, codecname):
"""Add a codec that map characters in the given charset to/from Unicode.
charset is the canonical name of a character set. codecname is the name
of a Python codec, as appropriate for the second argument to the unicode()
built-in, or to the .encode() method of a Unicode string.
"""
CODEC_MAP[charset] = codecname
class Charset:
"""Map character sets to their email properties.
This class provides information about the requirements imposed on email
for a specific character set. It also provides convenience routines for
converting between character sets, given the availability of the
applicable codecs. Given an character set, it will do its best to provide
information on how to use that character set in an email.
Certain character sets must be encoded with quoted-printable or base64
when used in email headers or bodies. Certain character sets must be
converted outright, and are not allowed in email. Instances of this
module expose the following information about a character set:
input_charset: The initial character set specified. Common aliases
are converted to their `official' email names (e.g. latin_1
is converted to iso-8859-1). Defaults to 7-bit us-ascii.
header_encoding: If the character set must be encoded before it can be
used in an email header, this attribute will be set to
Charset.QP (for quoted-printable) or Charset.BASE64 (for
base64 encoding). Otherwise, it will be None.
body_encoding: Same as header_encoding, but describes the encoding for the
mail message's body, which indeed may be different than the
header encoding.
output_charset: Some character sets must be converted before the can be
used in email headers or bodies. If the input_charset is
one of them, this attribute will contain the name of the
charset output will be converted to. Otherwise, it will
be None.
input_codec: The name of the Python codec used to convert the
input_charset to Unicode. If no conversion codec is
necessary, this attribute will be None.
output_codec: The name of the Python codec used to convert Unicode
to the output_charset. If no conversion codec is necessary,
this attribute will have the same value as the input_codec.
"""
def __init__(self, input_charset=DEFAULT_CHARSET):
# Set the input charset after filtering through the aliases
self.input_charset = ALIASES.get(input_charset, input_charset)
# We can try to guess which encoding and conversion to use by the
# charset_map dictionary. Try that first, but let the user override
# it.
henc, benc, conv = CHARSETS.get(self.input_charset,
(BASE64, BASE64, None))
# Set the attributes, allowing the arguments to override the default.
self.header_encoding = henc
self.body_encoding = benc
self.output_charset = ALIASES.get(conv, conv)
# Now set the codecs. If one isn't defined for input_charset,
# guess and try a Unicode codec with the same name as input_codec.
self.input_codec = CODEC_MAP.get(self.input_charset,
self.input_charset)
self.output_codec = CODEC_MAP.get(self.output_charset,
self.input_codec)
def __str__(self):
return self.input_charset.lower()
def __eq__(self, other):
return str(self) == str(other).lower()
def __ne__(self, other):
return not self.__eq__(other)
def get_body_encoding(self):
"""Return the content-transfer-encoding used for body encoding.
This is either the string `quoted-printable' or `base64' depending on
the encoding used, or it is a function in which case you should call
the function with a single argument, the Message object being
encoded. The function should then set the Content-Transfer-Encoding:
header itself to whatever is appropriate.
Returns "quoted-printable" if self.body_encoding is QP.
Returns "base64" if self.body_encoding is BASE64.
Returns "7bit" otherwise.
"""
if self.body_encoding == QP:
return 'quoted-printable'
elif self.body_encoding == BASE64:
return 'base64'
else:
return encode_7or8bit
def convert(self, s):
"""Convert a string from the input_codec to the output_codec."""
if self.input_codec <> self.output_codec:
return unicode(s, self.input_codec).encode(self.output_codec)
else:
return s
def to_splittable(self, s):
"""Convert a possibly multibyte string to a safely splittable format.
Uses the input_codec to try and convert the string to Unicode, so it
can be safely split on character boundaries (even for double-byte
characters).
Returns the string untouched if we don't know how to convert it to
Unicode with the input_charset.
Characters that could not be converted to Unicode will be replaced
with the Unicode replacement character U+FFFD.
"""
if isinstance(s, UnicodeType) or self.input_codec is None:
return s
try:
return unicode(s, self.input_codec, 'replace')
except LookupError:
# Input codec not installed on system, so return the original
# string unchanged.
return s
def from_splittable(self, ustr, to_output=1):
"""Convert a splittable string back into an encoded string.
Uses the proper codec to try and convert the string from
Unicode back into an encoded format. Return the string as-is
if it is not Unicode, or if it could not be encoded from
Unicode.
Characters that could not be converted from Unicode will be replaced
with an appropriate character (usually '?').
If to_output is true, uses output_codec to convert to an encoded
format. If to_output is false, uses input_codec. to_output defaults
to 1.
"""
if to_output:
codec = self.output_codec
else:
codec = self.input_codec
if not isinstance(ustr, UnicodeType) or codec is None:
return ustr
try:
return ustr.encode(codec, 'replace')
except LookupError:
# Output codec not installed
return ustr
def get_output_charset(self):
"""Return the output character set.
This is self.output_charset if that is set, otherwise it is
self.input_charset.
"""
return self.output_charset or self.input_charset
def encoded_header_len(self, s):
"""Return the length of the encoded header string."""
cset = self.get_output_charset()
# The len(s) of a 7bit encoding is len(s)
if self.header_encoding is BASE64:
return email.base64MIME.base64_len(s) + len(cset) + MISC_LEN
elif self.header_encoding is QP:
return email.quopriMIME.header_quopri_len(s) + len(cset) + MISC_LEN
else:
return len(s)
def header_encode(self, s, convert=0):
"""Header-encode a string, optionally converting it to output_charset.
If convert is true, the string will be converted from the input
charset to the output charset automatically. This is not useful for
multibyte character sets, which have line length issues (multibyte
characters must be split on a character, not a byte boundary); use the
high-level Header class to deal with these issues. convert defaults
to 0.
The type of encoding (base64 or quoted-printable) will be based on
self.header_encoding.
"""
cset = self.get_output_charset()
if convert:
s = self.convert(s)
# 7bit/8bit encodings return the string unchanged (modulo conversions)
if self.header_encoding is BASE64:
return email.base64MIME.header_encode(s, cset)
elif self.header_encoding is QP:
return email.quopriMIME.header_encode(s, cset)
else:
return s
def body_encode(self, s, convert=1):
"""Body-encode a string and convert it to output_charset.
If convert is true (the default), the string will be converted from
the input charset to output charset automatically. Unlike
header_encode(), there are no issues with byte boundaries and
multibyte charsets in email bodies, so this is usually pretty safe.
The type of encoding (base64 or quoted-printable) will be based on
self.body_encoding.
"""
if convert:
s = self.convert(s)
# 7bit/8bit encodings return the string unchanged (module conversions)
if self.body_encoding is BASE64:
return email.base64MIME.body_encode(s)
elif self.header_encoding is QP:
return email.quopriMIME.body_encode(s)
else:
return s

View file

@ -1,4 +1,4 @@
# Copyright (C) 2001 Python Software Foundation
# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""Module containing encoding functions for Image.Image and Text.Text.
@ -11,7 +11,9 @@ from quopri import encodestring as _encodestring
# Helpers
def _qencode(s):
return _encodestring(s, quotetabs=1)
enc = _encodestring(s, quotetabs=1)
# Must encode spaces, which quopri.encodestring() doesn't do
return enc.replace(' ', '=20')
def _bencode(s):
@ -54,6 +56,10 @@ def encode_quopri(msg):
def encode_7or8bit(msg):
"""Set the Content-Transfer-Encoding: header to 7bit or 8bit."""
orig = msg.get_payload()
if orig is None:
# There's no payload. For backwards compatibility we use 7bit
msg['Content-Transfer-Encoding'] = '7bit'
return
# We play a trick to make this go fast. If encoding to ASCII succeeds, we
# know the data must be 7bit, otherwise treat it as 8bit.
try:

View file

@ -1,4 +1,4 @@
# Copyright (C) 2001 Python Software Foundation
# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""email package exception classes.

View file

@ -1,4 +1,4 @@
# Copyright (C) 2001 Python Software Foundation
# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""Classes to generate plain text from a message object tree.
@ -166,30 +166,33 @@ class Generator:
return text
rtn = []
for line in text.split('\n'):
splitline = []
# Short lines can remain unchanged
if len(line.replace('\t', SPACE8)) <= maxheaderlen:
rtn.append(line)
SEMINLTAB.join(rtn)
splitline.append(line)
rtn.append(SEMINLTAB.join(splitline))
else:
oldlen = len(text)
oldlen = len(line)
# Try to break the line on semicolons, but if that doesn't
# work, try to split on folding whitespace.
while len(text) > maxheaderlen:
i = text.rfind(';', 0, maxheaderlen)
while len(line) > maxheaderlen:
i = line.rfind(';', 0, maxheaderlen)
if i < 0:
break
rtn.append(text[:i])
text = text[i+1:].lstrip()
if len(text) <> oldlen:
splitline.append(line[:i])
line = line[i+1:].lstrip()
if len(line) <> oldlen:
# Splitting on semis worked
rtn.append(text)
return SEMINLTAB.join(rtn)
splitline.append(line)
rtn.append(SEMINLTAB.join(splitline))
continue
# Splitting on semis didn't help, so try to split on
# whitespace.
parts = re.split(r'(\s+)', text)
parts = re.split(r'(\s+)', line)
# Watch out though for "Header: longnonsplittableline"
if parts[0].endswith(':') and len(parts) == 3:
return text
rtn.append(line)
continue
first = parts.pop(0)
sublines = [first]
acc = len(first)
@ -203,13 +206,14 @@ class Generator:
else:
# Split it here, but don't forget to ignore the
# next whitespace-only part
rtn.append(EMPTYSTRING.join(sublines))
splitline.append(EMPTYSTRING.join(sublines))
del parts[0]
first = parts.pop(0)
sublines = [first]
acc = len(first)
rtn.append(EMPTYSTRING.join(sublines))
return NLTAB.join(rtn)
splitline.append(EMPTYSTRING.join(sublines))
rtn.append(NLTAB.join(splitline))
return NL.join(rtn)
#
# Handlers for writing types and subtypes
@ -219,6 +223,9 @@ class Generator:
payload = msg.get_payload()
if payload is None:
return
cset = msg.get_charset()
if cset is not None:
payload = cset.body_encode(payload)
if not isinstance(payload, StringType):
raise TypeError, 'string payload expected: %s' % type(payload)
if self._mangle_from_:
@ -233,7 +240,18 @@ class Generator:
# together, and then make sure that the boundary we've chosen isn't
# present in the payload.
msgtexts = []
for part in msg.get_payload():
subparts = msg.get_payload()
if subparts is None:
# Nothing has every been attached
boundary = msg.get_boundary(failobj=_make_boundary())
print >> self._fp, '--' + boundary
print >> self._fp, '\n'
print >> self._fp, '--' + boundary + '--'
return
elif not isinstance(subparts, ListType):
# Scalar payload
subparts = [subparts]
for part in subparts:
s = StringIO()
g = self.__class__(s, self._mangle_from_, self.__maxheaderlen)
g(part, unixfrom=0)
@ -365,7 +383,7 @@ class DecodedGenerator(Generator):
# Helper
def _make_boundary(self, text=None):
def _make_boundary(text=None):
# Craft a random boundary. If text is given, ensure that the chosen
# boundary doesn't appear in the text.
boundary = ('=' * 15) + repr(random.random()).split('.')[1] + '=='

210
Lib/email/Header.py Normal file
View file

@ -0,0 +1,210 @@
# Copyright (C) 2002 Python Software Foundation
# Author: che@debian.org (Ben Gertzfield)
"""Header encoding and decoding functionality."""
import re
import email.quopriMIME
import email.base64MIME
from email.Charset import Charset
CRLFSPACE = '\r\n '
CRLF = '\r\n'
NLSPACE = '\n '
MAXLINELEN = 76
ENCODE = 1
DECODE = 2
# Match encoded-word strings in the form =?charset?q?Hello_World?=
ecre = re.compile(r'''
=\? # literal =?
(?P<charset>[^?]*?) # non-greedy up to the next ? is the charset
\? # literal ?
(?P<encoding>[qb]) # either a "q" or a "b", case insensitive
\? # literal ?
(?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
\?= # literal ?=
''', re.VERBOSE | re.IGNORECASE)
# Helpers
_max_append = email.quopriMIME._max_append
def decode_header(header):
"""Decode a message header value without converting charset.
Returns a list of (decoded_string, charset) pairs containing each of the
decoded parts of the header. Charset is None for non-encoded parts of the
header, otherwise a lower-case string containing the name of the character
set specified in the encoded string.
"""
# If no encoding, just return the header
header = str(header)
if not ecre.search(header):
return [(header, None)]
decoded = []
dec = ''
for line in header.splitlines():
# This line might not have an encoding in it
if not ecre.search(line):
decoded.append((line, None))
continue
parts = ecre.split(line)
while parts:
unenc = parts.pop(0).strip()
if unenc:
# Should we continue a long line?
if decoded and decoded[-1][1] is None:
decoded[-1] = (decoded[-1][0] + dec, None)
else:
decoded.append((unenc, None))
if parts:
charset, encoding = [s.lower() for s in parts[0:2]]
encoded = parts[2]
dec = ''
if encoding == 'q':
dec = email.quopriMIME.header_decode(encoded)
elif encoding == 'b':
dec = email.base64MIME.decode(encoded)
else:
dec = encoded
if decoded and decoded[-1][1] == charset:
decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
else:
decoded.append((dec, charset))
del parts[0:3]
return decoded
class Header:
def __init__(self, s, charset=None, maxlinelen=MAXLINELEN,
header_name=None):
"""Create a MIME-compliant header that can contain many languages.
Specify the initial header value in s. Specify its character set as a
Charset object in the charset argument. If none, a default Charset
instance will be used.
You can later append to the header with append(s, charset) below;
charset does not have to be the same as the one initially specified
here. In fact, it's optional, and if not given, defaults to the
charset specified in the constructor.
The maximum line length can either be specified by maxlinelen, or you
can pass in the name of the header field (e.g. "Subject") to let this
class guess the best line length to use to prevent wrapping. The
default maxlinelen is 76.
"""
if charset is None:
charset = Charset()
self._charset = charset
# BAW: I believe `chunks' and `maxlinelen' should be non-public.
self._chunks = []
self.append(s, charset)
self._maxlinelen = maxlinelen
if header_name is not None:
self.guess_maxlinelen(header_name)
def __str__(self):
"""A synonym for self.encode()."""
return self.encode()
def guess_maxlinelen(self, s=None):
"""Guess the maximum length to make each header line.
Given a header name (e.g. "Subject"), set this header's maximum line
length to an appropriate length to avoid line wrapping. If s is not
given, return the previous maximum line length and don't set it.
Returns the new maximum line length.
"""
# BAW: is this semantic necessary?
if s is not None:
self._maxlinelen = MAXLINELEN - len(s) - 2
return self._maxlinelen
def append(self, s, charset=None):
"""Append string s with Charset charset to the MIME header.
charset defaults to the one given in the class constructor.
"""
if charset is None:
charset = self._charset
self._chunks.append((s, charset))
def _split(self, s, charset):
# Split up a header safely for use with encode_chunks. BAW: this
# appears to be a private convenience method.
splittable = charset.to_splittable(s)
encoded = charset.from_splittable(splittable)
if charset.encoded_header_len(encoded) < self._maxlinelen:
return [(encoded, charset)]
else:
# Divide and conquer. BAW: halfway depends on integer division.
# When porting to Python 2.2, use the // operator.
halfway = len(splittable) // 2
first = charset.from_splittable(splittable[:halfway], 0)
last = charset.from_splittable(splittable[halfway:], 0)
return self._split(first, charset) + self._split(last, charset)
def encode(self):
"""Encode a message header, possibly converting charset and encoding.
There are many issues involved in converting a given string for use in
an email header. Only certain character sets are readable in most
email clients, and as header strings can only contain a subset of
7-bit ASCII, care must be taken to properly convert and encode (with
Base64 or quoted-printable) header strings. In addition, there is a
75-character length limit on any given encoded header field, so
line-wrapping must be performed, even with double-byte character sets.
This method will do its best to convert the string to the correct
character set used in email, and encode and line wrap it safely with
the appropriate scheme for that character set.
If the given charset is not known or an error occurs during
conversion, this function will return the header untouched.
"""
newchunks = []
for s, charset in self._chunks:
newchunks += self._split(s, charset)
self._chunks = newchunks
return self.encode_chunks()
def encode_chunks(self):
"""MIME-encode a header with many different charsets and/or encodings.
Given a list of pairs (string, charset), return a MIME-encoded string
suitable for use in a header field. Each pair may have different
charsets and/or encodings, and the resulting header will accurately
reflect each setting.
Each encoding can be email.Utils.QP (quoted-printable, for ASCII-like
character sets like iso-8859-1), email.Utils.BASE64 (Base64, for
non-ASCII like character sets like KOI8-R and iso-2022-jp), or None
(no encoding).
Each pair will be represented on a separate line; the resulting string
will be in the format:
"=?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
=?charset2?b?SvxyZ2VuIEL2aW5n?="
"""
chunks = []
for header, charset in self._chunks:
if charset is None:
_max_append(chunks, header, self._maxlinelen, ' ')
else:
_max_append(chunks, charset.header_encode(header, 0),
self._maxlinelen, ' ')
return NLSPACE.join(chunks)

View file

@ -1,4 +1,4 @@
# Copyright (C) 2001 Python Software Foundation
# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""Various types of useful iterators and generators.

View file

@ -1,4 +1,4 @@
# Copyright (C) 2001 Python Software Foundation
# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""Base class for MIME specializations.

View file

@ -1,4 +1,4 @@
# Copyright (C) 2001 Python Software Foundation
# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""Class representing image/* type MIME documents.

View file

@ -1,4 +1,4 @@
# Copyright (C) 2001 Python Software Foundation
# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""Class representing message/* MIME documents.

View file

@ -1,9 +1,10 @@
# Copyright (C) 2001 Python Software Foundation
# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""Class representing text/* type MIME documents.
"""
import warnings
import MIMEBase
from Encoders import encode_7or8bit
@ -13,7 +14,7 @@ class MIMEText(MIMEBase.MIMEBase):
"""Class for generating text/* type MIME documents."""
def __init__(self, _text, _subtype='plain', _charset='us-ascii',
_encoder=encode_7or8bit):
_encoder=None):
"""Create a text/* type MIME document.
_text is the string for this message object. If the text does not end
@ -22,20 +23,26 @@ class MIMEText(MIMEBase.MIMEBase):
_subtype is the MIME sub content type, defaulting to "plain".
_charset is the character set parameter added to the Content-Type:
header. This defaults to "us-ascii".
header. This defaults to "us-ascii". Note that as a side-effect, the
Content-Transfer-Encoding: header will also be set.
_encoder is a function which will perform the actual encoding for
transport of the text data. It takes one argument, which is this
Text instance. It should use get_payload() and set_payload() to
change the payload to the encoded form. It should also add any
Content-Transfer-Encoding: or other headers to the message as
necessary. The default encoding doesn't actually modify the payload,
but it does set Content-Transfer-Encoding: to either `7bit' or `8bit'
as appropriate.
The use of the _encoder is deprecated. The encoding of the payload,
and the setting of the character set parameter now happens implicitly
based on the _charset argument. If _encoder is supplied, then a
DeprecationWarning is used, and the _encoder functionality may
override any header settings indicated by _charset. This is probably
not what you want.
"""
MIMEBase.MIMEBase.__init__(self, 'text', _subtype,
**{'charset': _charset})
if _text and _text[-1] <> '\n':
_text += '\n'
self.set_payload(_text)
_encoder(self)
self.set_payload(_text, _charset)
if _encoder is not None:
warnings.warn('_encoder argument is obsolete.',
DeprecationWarning, 2)
# Because set_payload() with a _charset will set its own
# Content-Transfer-Encoding: header, we need to delete the
# existing one or will end up with two of them. :(
del self['content-transfer-encoding']
_encoder(self)

View file

@ -1,23 +1,47 @@
# Copyright (C) 2001 Python Software Foundation
# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""Basic message object for the email package object model.
"""
from __future__ import generators
import re
import base64
import quopri
import warnings
from cStringIO import StringIO
from types import ListType
from types import ListType, StringType
# Intrapackage imports
import Errors
import Utils
import Charset
SEMISPACE = '; '
# Regular expression used to split header parameters. BAW: this may be too
# simple. It isn't strictly RFC 2045 (section 5.1) compliant, but it catches
# most headers found in the wild. We may eventually need a full fledged
# parser eventually.
paramre = re.compile(r'\s*;\s*')
# Regular expression that matches `special' characters in parameters, the
# existance of which force quoting of the parameter value.
tspecials = re.compile(r'[ \(\)<>@,;:\\"/\[\]\?=]')
# Helper function
def _formatparam(param, value=None, quote=1):
"""Convenience function to format and return a key=value pair.
Will quote the value if needed or if quote is true.
"""
if value is not None and len(value) > 0:
# BAW: Please check this. I think that if quote is set it should
# force quoting even if not necessary.
if quote or tspecials.search(value):
return '%s="%s"' % (param, Utils.quote(value))
else:
return '%s=%s' % (param, value)
else:
return param
@ -39,6 +63,7 @@ class Message:
self._headers = []
self._unixfrom = None
self._payload = None
self._charset = None
# Defaults for multipart messages
self.preamble = self.epilogue = None
@ -83,6 +108,8 @@ class Message:
If the current payload is empty, then the current payload will be made
a scalar, set to the given value.
"""
warnings.warn('add_payload() is deprecated, use attach() instead.',
DeprecationWarning, 2)
if self._payload is None:
self._payload = payload
elif type(self._payload) is ListType:
@ -93,8 +120,18 @@ class Message:
else:
self._payload = [self._payload, payload]
# A useful synonym
attach = add_payload
def attach(self, payload):
"""Add the given payload to the current payload.
The current payload will always be a list of objects after this method
is called. If you want to set the payload to a scalar object
(e.g. because you're attaching a message/rfc822 subpart), use
set_payload() instead.
"""
if self._payload is None:
self._payload = [payload]
else:
self._payload.append(payload)
def get_payload(self, i=None, decode=0):
"""Return the current payload exactly as is.
@ -128,10 +165,58 @@ class Message:
return payload
def set_payload(self, payload):
"""Set the payload to the given value."""
self._payload = payload
def set_payload(self, payload, charset=None):
"""Set the payload to the given value.
Optionally set the charset, which must be a Charset instance."""
self._payload = payload
if charset is not None:
self.set_charset(charset)
def set_charset(self, charset):
"""Set the charset of the payload to a given character set.
charset can be a string or a Charset object. If it is a string, it
will be converted to a Charset object by calling Charset's
constructor. If charset is None, the charset parameter will be
removed from the Content-Type: field. Anything else will generate a
TypeError.
The message will be assumed to be a text message encoded with
charset.input_charset. It will be converted to charset.output_charset
and encoded properly, if needed, when generating the plain text
representation of the message. MIME headers (MIME-Version,
Content-Type, Content-Transfer-Encoding) will be added as needed.
"""
if charset is None:
self.del_param('charset')
self._charset = None
return
if isinstance(charset, StringType):
charset = Charset.Charset(charset)
if not isinstance(charset, Charset.Charset):
raise TypeError, charset
# BAW: should we accept strings that can serve as arguments to the
# Charset constructor?
self._charset = charset
if not self.has_key('MIME-Version'):
self.add_header('MIME-Version', '1.0')
if not self.has_key('Content-Type'):
self.add_header('Content-Type', 'text/plain',
charset=charset.get_output_charset())
else:
self.set_param('charset', charset.get_output_charset())
if not self.has_key('Content-Transfer-Encoding'):
cte = charset.get_body_encoding()
if callable(cte):
cte(self)
else:
self.add_header('Content-Transfer-Encoding', cte)
def get_charset(self):
"""Return the Charset object associated with the message's payload."""
return self._charset
#
# MAPPING INTERFACE (partial)
#
@ -257,7 +342,7 @@ class Message:
if v is None:
parts.append(k.replace('_', '-'))
else:
parts.append('%s="%s"' % (k.replace('_', '-'), v))
parts.append(_formatparam(k.replace('_', '-'), v))
if _value is not None:
parts.insert(0, _value)
self._headers.append((_name, SEMISPACE.join(parts)))
@ -308,6 +393,8 @@ class Message:
for p in paramre.split(value):
try:
name, val = p.split('=', 1)
name = name.rstrip()
val = val.lstrip()
except ValueError:
# Must have been a bare attribute
name = p
@ -315,26 +402,29 @@ class Message:
params.append((name, val))
return params
def get_params(self, failobj=None, header='content-type'):
def get_params(self, failobj=None, header='content-type', unquote=1):
"""Return the message's Content-Type: parameters, as a list.
The elements of the returned list are 2-tuples of key/value pairs, as
split on the `=' sign. The left hand side of the `=' is the key,
while the right hand side is the value. If there is no `=' sign in
the parameter the value is the empty string. The value is always
unquoted.
unquoted, unless unquote is set to a false value.
Optional failobj is the object to return if there is no Content-Type:
header. Optional header is the header to search instead of
Content-Type:
Content-Type:.
"""
missing = []
params = self._get_params_preserve(missing, header)
if params is missing:
return failobj
return [(k, Utils.unquote(v)) for k, v in params]
if unquote:
return [(k, Utils.unquote(v)) for k, v in params]
else:
return params
def get_param(self, param, failobj=None, header='content-type'):
def get_param(self, param, failobj=None, header='content-type', unquote=1):
"""Return the parameter value if found in the Content-Type: header.
Optional failobj is the object to return if there is no Content-Type:
@ -342,15 +432,112 @@ class Message:
Content-Type:
Parameter keys are always compared case insensitively. Values are
always unquoted.
always unquoted, unless unquote is set to a false value.
"""
if not self.has_key(header):
return failobj
for k, v in self._get_params_preserve(failobj, header):
if k.lower() == param.lower():
return Utils.unquote(v)
if unquote:
return Utils.unquote(v)
else:
return v
return failobj
def set_param(self, param, value, header='Content-Type', requote=1):
"""Set a parameter in the Content-Type: header.
If the parameter already exists in the header, its value will be
replaced with the new value.
If header is Content-Type: and has not yet been defined in this
message, it will be set to "text/plain" and the new parameter and
value will be appended, as per RFC 2045.
An alternate header can specified in the header argument, and
all parameters will be quoted as appropriate unless requote is
set to a false value.
"""
if not self.has_key(header) and header.lower() == 'content-type':
ctype = 'text/plain'
else:
ctype = self.get(header)
if not self.get_param(param, header=header):
if not ctype:
ctype = _formatparam(param, value, requote)
else:
ctype = SEMISPACE.join(
[ctype, _formatparam(param, value, requote)])
else:
ctype = ''
for old_param, old_value in self.get_params(header=header,
unquote=requote):
append_param = ''
if old_param.lower() == param.lower():
append_param = _formatparam(param, value, requote)
else:
append_param = _formatparam(old_param, old_value, requote)
if not ctype:
ctype = append_param
else:
ctype = SEMISPACE.join([ctype, append_param])
if ctype <> self.get(header):
del self[header]
self[header] = ctype
def del_param(self, param, header='content-type', requote=1):
"""Remove the given parameter completely from the Content-Type header.
The header will be re-written in place without param or its value.
All values will be quoted as appropriate unless requote is set to a
false value.
"""
if not self.has_key(header):
return
new_ctype = ''
for p, v in self.get_params(header, unquote=requote):
if p.lower() <> param.lower():
if not new_ctype:
new_ctype = _formatparam(p, v, requote)
else:
new_ctype = SEMISPACE.join([new_ctype,
_formatparam(p, v, requote)])
if new_ctype <> self.get(header):
del self[header]
self[header] = new_ctype
def set_type(self, type, header='Content-Type', requote=1):
"""Set the main type and subtype for the Content-Type: header.
type must be a string in the form "maintype/subtype", otherwise a
ValueError is raised.
This method replaces the Content-Type: header, keeping all the
parameters in place. If requote is false, this leaves the existing
header's quoting as is. Otherwise, the parameters will be quoted (the
default).
An alternate header can be specified in the header argument. When the
Content-Type: header is set, we'll always also add a MIME-Version:
header.
"""
# BAW: should we be strict?
if not type.count('/') == 1:
raise ValueError
# Set the Content-Type: you get a MIME-Version:
if header.lower() == 'content-type':
del self['mime-version']
self['MIME-Version'] = '1.0'
if not self.has_key(header):
self[header] = type
return
params = self.get_params(header, unquote=requote)
del self[header]
self[header] = type
# Skip the first param; it's the old type.
for p, v in params[1:]:
self.set_param(p, v, header, requote)
def get_filename(self, failobj=None):
"""Return the filename associated with the payload if present.

View file

@ -51,9 +51,16 @@ class Parser:
lastvalue = []
lineno = 0
while 1:
line = fp.readline()[:-1]
if not line or not line.strip():
# Don't strip the line before we test for the end condition,
# because whitespace-only header lines are RFC compliant
# continuation lines.
line = fp.readline()
if not line:
break
line = line.splitlines()[0]
if not line:
break
# Ignore the trailing newline
lineno += 1
# Check for initial Unix From_ line
if line.startswith('From '):
@ -63,7 +70,6 @@ class Parser:
else:
raise Errors.HeaderParseError(
'Unix-from in headers after first rfc822 header')
#
# Header continuation line
if line[0] in ' \t':
if not lastheader:
@ -134,11 +140,11 @@ class Parser:
msgobj = self.parsestr(part)
container.preamble = preamble
container.epilogue = epilogue
# Ensure that the container's payload is a list
if not isinstance(container.get_payload(), ListType):
container.set_payload([msgobj])
else:
container.add_payload(msgobj)
container.attach(msgobj)
elif container.get_main_type() == 'multipart':
# Very bad. A message is a multipart with no boundary!
raise Errors.BoundaryError(
'multipart message with no defined boundary')
elif container.get_type() == 'message/delivery-status':
# This special kind of type contains blocks of headers separated
# by a blank line. We'll represent each header block as a
@ -160,9 +166,9 @@ class Parser:
except Errors.HeaderParseError:
msg = self._class()
self._parsebody(msg, fp)
container.add_payload(msg)
container.set_payload(msg)
else:
container.add_payload(fp.read())
container.set_payload(fp.read())

View file

@ -1,16 +1,26 @@
# Copyright (C) 2001 Python Software Foundation
# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""Miscellaneous utilities.
"""
import time
import socket
import re
import random
import os
import warnings
from cStringIO import StringIO
from types import ListType
from rfc822 import unquote, quote, parseaddr
from rfc822 import dump_address_pair
from rfc822 import unquote, quote
from rfc822 import AddrlistClass as _AddrlistClass
from rfc822 import parsedate_tz, parsedate, mktime_tz
from rfc822 import mktime_tz
# We need wormarounds for bugs in these methods in older Pythons (see below)
from rfc822 import parsedate as _parsedate
from rfc822 import parsedate_tz as _parsedate_tz
from rfc822 import parseaddr as _parseaddr
from quopri import decodestring as _qdecode
import base64
@ -20,6 +30,10 @@ from Encoders import _bencode, _qencode
COMMASPACE = ', '
UEMPTYSTRING = u''
CRLF = '\r\n'
specialsre = re.compile(r'[][\()<>@,:;".]')
escapesre = re.compile(r'[][\()"]')
@ -43,6 +57,41 @@ def _bdecode(s):
return value
def fix_eols(s):
"""Replace all line-ending characters with \r\n."""
# Fix newlines with no preceding carriage return
s = re.sub(r'(?<!\r)\n', CRLF, s)
# Fix carriage returns with no following newline
s = re.sub(r'\r(?!\n)', CRLF, s)
return s
def formataddr(pair):
"""The inverse of parseaddr(), this takes a 2-tuple of the form
(realname, email_address) and returns the string value suitable
for an RFC 2822 From:, To: or Cc:.
If the first element of pair is false, then the second element is
returned unmodified.
"""
name, address = pair
if name:
quotes = ''
if specialsre.search(name):
quotes = '"'
name = escapesre.sub(r'\\\g<0>', name)
return '%s%s%s <%s>' % (quotes, name, quotes, address)
return address
# For backwards compatibility
def dump_address_pair(pair):
warnings.warn('Use email.Utils.formataddr() instead',
DeprecationWarning, 2)
return formataddr(pair)
def getaddresses(fieldvalues):
"""Return a list of (REALNAME, EMAIL) for each fieldvalue."""
@ -64,30 +113,26 @@ ecre = re.compile(r'''
def decode(s):
"""Return a decoded string according to RFC 2047, as a unicode string."""
"""Return a decoded string according to RFC 2047, as a unicode string.
NOTE: This function is deprecated. Use Header.decode_header() instead.
"""
warnings.warn('Use Header.decode_header() instead.', DeprecationWarning, 2)
# Intra-package import here to avoid circular import problems.
from Header import decode_header
L = decode_header(s)
if not isinstance(L, ListType):
# s wasn't decoded
return s
rtn = []
parts = ecre.split(s, 1)
while parts:
# If there are less than 4 parts, it can't be encoded and we're done
if len(parts) < 5:
rtn.extend(parts)
break
# The first element is any non-encoded leading text
rtn.append(parts[0])
charset = parts[1]
encoding = parts[2].lower()
atom = parts[3]
# The next chunk to decode should be in parts[4]
parts = ecre.split(parts[4])
# The encoding must be either `q' or `b', case-insensitive
if encoding == 'q':
func = _qdecode
elif encoding == 'b':
func = _bdecode
for atom, charset in L:
if charset is None:
rtn.append(atom)
else:
func = _identity
# Decode and get the unicode in the charset
rtn.append(unicode(func(atom), charset))
# Convert the string to Unicode using the given encoding. Leave
# Unicode conversion errors to strict.
rtn.append(unicode(atom, charset))
# Now that we've decoded everything, we just need to join all the parts
# together into the final string.
return UEMPTYSTRING.join(rtn)
@ -96,6 +141,7 @@ def decode(s):
def encode(s, charset='iso-8859-1', encoding='q'):
"""Encode a string according to RFC 2047."""
warnings.warn('Use Header.Header.encode() instead.', DeprecationWarning, 2)
encoding = encoding.lower()
if encoding == 'q':
estr = _qencode(s)
@ -150,3 +196,48 @@ def formatdate(timeval=None, localtime=0):
'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'][now[1] - 1],
now[0], now[3], now[4], now[5],
zone)
def make_msgid(idstring=None):
"""Returns a string suitable for RFC 2822 compliant Message-ID:, e.g:
<20020201195627.33539.96671@nightshade.la.mastaler.com>
Optional idstring if given is a string used to strengthen the
uniqueness of the Message-ID, otherwise an empty string is used.
"""
timeval = time.time()
utcdate = time.strftime('%Y%m%d%H%M%S', time.gmtime(timeval))
pid = os.getpid()
randint = random.randrange(100000)
if idstring is None:
idstring = ''
else:
idstring = '.' + idstring
idhost = socket.getfqdn()
msgid = '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, idstring, idhost)
return msgid
# These functions are in the standalone mimelib version only because they've
# subsequently been fixed in the latest Python versions. We use this to worm
# around broken older Pythons.
def parsedate(data):
if not data:
return None
return _parsedate(data)
def parsedate_tz(data):
if not data:
return None
return _parsedate_tz(data)
def parseaddr(addr):
realname, emailaddr = _parseaddr(addr)
if realname == '' and emailaddr is None:
return '', ''
return realname, emailaddr

View file

@ -1,14 +1,16 @@
# Copyright (C) 2001 Python Software Foundation
# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""A package for parsing, handling, and generating email messages.
"""
__version__ = '1.0'
__version__ = '2.0'
__all__ = ['Encoders',
__all__ = ['Charset',
'Encoders',
'Errors',
'Generator',
'Header',
'Iterators',
'MIMEAudio',
'MIMEBase',
@ -18,6 +20,8 @@ __all__ = ['Encoders',
'Message',
'Parser',
'Utils',
'base64MIME',
'quopriMIME',
'message_from_string',
'message_from_file',
]

174
Lib/email/base64MIME.py Normal file
View file

@ -0,0 +1,174 @@
# Copyright (C) 2002 Python Software Foundation
# Author: che@debian.org (Ben Gertzfield)
"""Base64 content transfer encoding per RFCs 2045-2047.
This module handles the content transfer encoding method defined in RFC 2045
to encode arbitrary 8-bit data using the three 8-bit bytes in four 7-bit
characters encoding known as Base64.
It is used in the MIME standards for email to attach images, audio, and text
using some 8-bit character sets to messages.
This module provides an interface to encode and decode both headers and bodies
with Base64 encoding.
RFC 2045 defines a method for including character set information in an
`encoded-word' in a header. This method is commonly used for 8-bit real names
in To:, From:, Cc:, etc. fields, as well as Subject: lines.
This module does not do the line wrapping or end-of-line character conversion
necessary for proper internationalized headers; it only does dumb encoding and
decoding. To deal with the various line wrapping issues, use the email.Header
module.
"""
import re
from binascii import b2a_base64, a2b_base64
from email.Utils import fix_eols
CRLF = '\r\n'
NL = '\n'
EMPTYSTRING = ''
# See also Charset.py
MISC_LEN = 7
# Helpers
def base64_len(s):
"""Return the length of s when it is encoded with base64."""
groups_of_3, leftover = divmod(len(s), 3)
# 4 bytes out for each 3 bytes (or nonzero fraction thereof) in.
# Thanks, Tim!
n = groups_of_3 * 4
if leftover:
n += 4
return n
def header_encode(header, charset='iso-8859-1', keep_eols=0, maxlinelen=76,
eol=NL):
"""Encode a single header line with Base64 encoding in a given charset.
Defined in RFC 2045, this Base64 encoding is identical to normal Base64
encoding, except that each line must be intelligently wrapped (respecting
the Base64 encoding), and subsequent lines must start with a space.
charset names the character set to use to encode the header. It defaults
to iso-8859-1.
End-of-line characters (\\r, \\n, \\r\\n) will be automatically converted
to the canonical email line separator \\r\\n unless the keep_eols
parameter is set to true (the default is false).
Each line of the header will be terminated in the value of eol, which
defaults to "\\n". Set this to "\\r\\n" if you are using the result of
this function directly in email.
The resulting string will be in the form:
"=?charset?b?WW/5ciBtYXp66XLrIHf8eiBhIGhhbXBzdGHuciBBIFlv+XIgbWF6euly?=\\n
=?charset?b?6yB3/HogYSBoYW1wc3Rh7nIgQkMgWW/5ciBtYXp66XLrIHf8eiBhIGhh?="
with each line wrapped at, at most, maxlinelen characters (defaults to 76
characters).
"""
# Return empty headers unchanged
if not header:
return header
if not keep_eols:
header = fix_eols(header)
# Base64 encode each line, in encoded chunks no greater than maxlinelen in
# length, after the RFC chrome is added in.
base64ed = []
max_encoded = maxlinelen - len(charset) - MISC_LEN
max_unencoded = max_encoded * 3 / 4
# BAW: Ben's original code used a step of max_unencoded, but I think it
# ought to be max_encoded. Otherwise, where's max_encoded used? I'm
# still not sure what the
for i in range(0, len(header), max_unencoded):
base64ed.append(b2a_base64(header[i:i+max_unencoded]))
# Now add the RFC chrome to each encoded chunk
lines = []
for line in base64ed:
# Ignore the last character of each line if it is a newline
if line[-1] == NL:
line = line[:-1]
# Add the chrome
lines.append('=?%s?b?%s?=' % (charset, line))
# Glue the lines together and return it. BAW: should we be able to
# specify the leading whitespace in the joiner?
joiner = eol + ' '
return joiner.join(lines)
def encode(s, binary=1, maxlinelen=76, eol=NL):
"""Encode a string with base64.
Each line will be wrapped at, at most, maxlinelen characters (defaults to
76 characters).
If binary is false, end-of-line characters will be converted to the
canonical email end-of-line sequence \\r\\n. Otherwise they will be left
verbatim (this is the default).
Each line of encoded text will end with eol, which defaults to "\\n". Set
this to "\r\n" if you will be using the result of this function directly
in an email.
"""
if not s:
return s
if not binary:
s = fix_eols(s)
encvec = []
max_unencoded = maxlinelen * 3 / 4
for i in range(0, len(s), max_unencoded):
# BAW: should encode() inherit b2a_base64()'s dubious behavior in
# adding a newline to the encoded string?
enc = b2a_base64(s[i:i + max_unencoded])
if enc[-1] == NL and eol <> NL:
enc = enc[:-1] + eol
encvec.append(enc)
return EMPTYSTRING.join(encvec)
# For convenience and backwards compatibility w/ standard base64 module
body_encode = encode
encodestring = encode
def decode(s, convert_eols=None):
"""Decode a raw base64 string.
If convert_eols is set to a string value, all canonical email linefeeds,
e.g. "\\r\\n", in the decoded text will be converted to the value of
convert_eols. os.linesep is a good choice for convert_eols if you are
decoding a text attachment.
This function does not parse a full MIME header value encoded with
base64 (like =?iso-8895-1?b?bmloISBuaWgh?=) -- please use the high
level email.Header class for that functionality.
"""
if not s:
return s
dec = a2b_base64(s)
if convert_eols:
return dec.replace(CRLF, convert_eols)
return dec
# For convenience and backwards compatibility w/ standard base64 module
body_decode = decode
decodestring = decode

312
Lib/email/quopriMIME.py Normal file
View file

@ -0,0 +1,312 @@
# Copyright (C) 2001,2002 Python Software Foundation
# Author: che@debian.org (Ben Gertzfield)
"""Quoted-printable content transfer encoding per RFCs 2045-2047.
This module handles the content transfer encoding method defined in RFC 2045
to encode US ASCII-like 8-bit data called `quoted-printable'. It is used to
safely encode text that is in a character set similar to the 7-bit US ASCII
character set, but that includes some 8-bit characters that are normally not
allowed in email bodies or headers.
Quoted-printable is very space-inefficient for encoding binary files; use the
email.base64MIME module for that instead.
This module provides an interface to encode and decode both headers and bodies
with quoted-printable encoding.
RFC 2045 defines a method for including character set information in an
`encoded-word' in a header. This method is commonly used for 8-bit real names
in To:/From:/Cc: etc. fields, as well as Subject: lines.
This module does not do the line wrapping or end-of-line character
conversion necessary for proper internationalized headers; it only
does dumb encoding and decoding. To deal with the various line
wrapping issues, use the email.Header module.
"""
import re
from string import hexdigits
from email.Utils import fix_eols
CRLF = '\r\n'
NL = '\n'
# See also Charset.py
MISC_LEN = 7
hqre = re.compile(r'[^-a-zA-Z0-9!*+/ ]')
bqre = re.compile(r'[^ !-<>-~\t]')
# Helpers
def header_quopri_check(c):
"""Return true if the character should be escaped with header quopri."""
return hqre.match(c) and 1
def body_quopri_check(c):
"""Return true if the character should be escaped with body quopri."""
return bqre.match(c) and 1
def header_quopri_len(s):
"""Return the length of str when it is encoded with header quopri."""
count = 0
for c in s:
if hqre.match(c):
count += 3
else:
count += 1
return count
def body_quopri_len(str):
"""Return the length of str when it is encoded with body quopri."""
count = 0
for c in str:
if bqre.match(c):
count += 3
else:
count += 1
return count
def _max_append(L, s, maxlen, extra=''):
if not L:
L.append(s)
elif len(L[-1]) + len(s) < maxlen:
L[-1] += extra + s
else:
L.append(s)
def unquote(s):
"""Turn a string in the form =AB to the ASCII character with value 0xab"""
return chr(int(s[1:3], 16))
def quote(c):
return "=%02X" % ord(c)
def header_encode(header, charset="iso-8859-1", keep_eols=0, maxlinelen=76,
eol=NL):
"""Encode a single header line with quoted-printable (like) encoding.
Defined in RFC 2045, this `Q' encoding is similar to quoted-printable, but
used specifically for email header fields to allow charsets with mostly 7
bit characters (and some 8 bit) to remain more or less readable in non-RFC
2045 aware mail clients.
charset names the character set to use to encode the header. It defaults
to iso-8859-1.
The resulting string will be in the form:
"=?charset?q?I_f=E2rt_in_your_g=E8n=E8ral_dire=E7tion?\\n
=?charset?q?Silly_=C8nglish_Kn=EEghts?="
with each line wrapped safely at, at most, maxlinelen characters (defaults
to 76 characters).
End-of-line characters (\\r, \\n, \\r\\n) will be automatically converted
to the canonical email line separator \\r\\n unless the keep_eols
parameter is set to true (the default is false).
Each line of the header will be terminated in the value of eol, which
defaults to "\\n". Set this to "\\r\\n" if you are using the result of
this function directly in email.
"""
# Return empty headers unchanged
if not header:
return header
if not keep_eols:
header = fix_eols(header)
# Quopri encode each line, in encoded chunks no greater than maxlinelen in
# lenght, after the RFC chrome is added in.
quoted = []
max_encoded = maxlinelen - len(charset) - MISC_LEN
for c in header:
# Space may be represented as _ instead of =20 for readability
if c == ' ':
_max_append(quoted, '_', max_encoded)
# These characters can be included verbatim
elif not hqre.match(c):
_max_append(quoted, c, max_encoded)
# Otherwise, replace with hex value like =E2
else:
_max_append(quoted, "=%02X" % ord(c), max_encoded)
# Now add the RFC chrome to each encoded chunk and glue the chunks
# together. BAW: should we be able to specify the leading whitespace in
# the joiner?
joiner = eol + ' '
return joiner.join(['=?%s?q?%s?=' % (charset, line) for line in quoted])
def encode(body, binary=0, maxlinelen=76, eol=NL):
"""Encode with quoted-printable, wrapping at maxlinelen characters.
If binary is false (the default), end-of-line characters will be converted
to the canonical email end-of-line sequence \\r\\n. Otherwise they will
be left verbatim.
Each line of encoded text will end with eol, which defaults to "\\n". Set
this to "\\r\\n" if you will be using the result of this function directly
in an email.
Each line will be wrapped at, at most, maxlinelen characters (defaults to
76 characters). Long lines will have the `soft linefeed' quoted-printable
character "=" appended to them, so the decoded text will be identical to
the original text.
"""
if not body:
return body
if not binary:
body = fix_eols(body)
# BAW: We're accumulating the body text by string concatenation. That
# can't be very efficient, but I don't have time now to rewrite it. It
# just feels like this algorithm could be more efficient.
encoded_body = ''
lineno = -1
# Preserve line endings here so we can check later to see an eol needs to
# be added to the output later.
lines = body.splitlines(1)
for line in lines:
# But strip off line-endings for processing this line.
if line.endswith(CRLF):
line = line[:-2]
elif line[-1] in CRLF:
line = line[:-1]
lineno += 1
encoded_line = ''
prev = None
linelen = len(line)
# Now we need to examine every character to see if it needs to be
# quopri encoded. BAW: again, string concatenation is inefficient.
for j in range(linelen):
c = line[j]
prev = c
if bqre.match(c):
c = quote(c)
elif j+1 == linelen:
# Check for whitespace at end of line; special case
if c not in ' \t':
encoded_line += c
prev = c
continue
# Check to see to see if the line has reached its maximum length
if len(encoded_line) + len(c) >= maxlinelen:
encoded_body += encoded_line + '=' + eol
encoded_line = ''
encoded_line += c
# Now at end of line..
if prev and prev in ' \t':
# Special case for whitespace at end of file
if lineno+1 == len(lines):
prev = quote(prev)
if len(encoded_line) + len(prev) > maxlinelen:
encoded_body += encoded_line + '=' + eol + prev
else:
encoded_body += encoded_line + prev
# Just normal whitespace at end of line
else:
encoded_body += encoded_line + prev + '=' + eol
encoded_line = ''
# Now look at the line we just finished and it has a line ending, we
# need to add eol to the end of the line.
if lines[lineno].endswith(CRLF) or lines[lineno][-1] in CRLF:
encoded_body += encoded_line + eol
else:
encoded_body += encoded_line
encoded_line = ''
return encoded_body
# For convenience and backwards compatibility w/ standard base64 module
body_encode = encode
encodestring = encode
# BAW: I'm not sure if the intent was for the signature of this function to be
# the same as base64MIME.decode() or not...
def decode(encoded, eol=NL):
"""Decode a quoted-printable string.
Lines are separated with eol, which defaults to \\n.
"""
if not encoded:
return encoded
# BAW: see comment in encode() above. Again, we're building up the
# decoded string with string concatenation, which could be done much more
# efficiently.
decoded = ''
for line in encoded.splitlines():
line = line.rstrip()
if not line:
decoded += eol
continue
i = 0
n = len(line)
while i < n:
c = line[i]
if c <> '=':
decoded += c
i += 1
# Otherwise, c == "=". Are we at the end of the line? If so, add
# a soft line break.
elif i+1 == n:
i += 1
continue
# Decode if in form =AB
elif i+2 < n and line[i+1] in hexdigits and line[i+2] in hexdigits:
decoded += unquote(line[i:i+3])
i += 3
# Otherwise, not in form =AB, pass literally
else:
decoded += c
i += 1
if i == n:
decoded += eol
# Special case if original string did not end with eol
if encoded[-1] <> eol and decoded[-1] == eol:
decoded = decoded[:-1]
return decoded
# For convenience and backwards compatibility w/ standard base64 module
body_decode = decode
decodestring = decode
def _unquote_match(match):
"""Turn a match in the form =AB to the ASCII character with value 0xab"""
s = match.group(0)
return unquote(s)
# Header decoding is done a bit differently
def header_decode(s):
"""Decode a string encoded with RFC 2045 MIME header `Q' encoding.
This function does not parse a full MIME header value encoded with
quoted-printable (like =?iso-8895-1?q?Hello_World?=) -- please use
the high level email.Header class for that functionality.
"""
s = s.replace('_', ' ')
return re.sub(r'=\w{2}', _unquote_match, s)