#18891: Complete new provisional email API.

This adds the EmailMessage and MIMEPart subclasses of Message
with new API methods, and a ContentManager class used by
the new methods.  Also adds a new policy setting, content_manager.

Patch was reviewed by Stephen J. Turnbull and Serhiy Storchaka,
and reflects their feedback.

I will ideally add some examples of using the new API to the
documentation before the final release.
This commit is contained in:
R David Murray 2013-10-16 22:48:40 -04:00
parent 1a16288197
commit 3da240fd01
15 changed files with 2539 additions and 26 deletions

249
Lib/email/contentmanager.py Normal file
View file

@ -0,0 +1,249 @@
import binascii
import email.charset
import email.message
import email.errors
from email import quoprimime
class ContentManager:
    """Dispatch table for getting content out of, and into, messages.

    Get handlers are keyed by content-type strings; set handlers are keyed
    by the type of the object being stored (or by one of its names).
    """

    def __init__(self):
        # Both registries start out empty; see the add_*_handler methods.
        self.get_handlers = {}
        self.set_handlers = {}

    def add_get_handler(self, key, handler):
        """Register *handler* as the get handler for *key*."""
        self.get_handlers[key] = handler

    def get_content(self, msg, *args, **kw):
        """Return the result of the most specific get handler for *msg*.

        The full content type is tried first, then the main type, then the
        empty-string key.  The selected handler is called with *msg* and
        any extra arguments.  Raise KeyError(content_type) if no key is
        registered.
        """
        registry = self.get_handlers
        content_type = msg.get_content_type()
        key = content_type
        if key not in registry:
            # Only ask for the maintype once the full type has missed.
            key = msg.get_content_maintype()
            if key not in registry:
                key = ''
                if key not in registry:
                    raise KeyError(content_type)
        return registry[key](msg, *args, **kw)

    def add_set_handler(self, typekey, handler):
        """Register *handler* as the set handler for *typekey*."""
        self.set_handlers[typekey] = handler

    def set_content(self, msg, obj, *args, **kw):
        """Store *obj* as the content of *msg* using a registered handler.

        Raise TypeError if *msg* is a multipart.  The handler is located
        before the existing content is cleared, so a failed lookup leaves
        *msg* untouched.
        """
        if msg.get_content_maintype() == 'multipart':
            # XXX: is this error a good idea or not?  We can remove it later,
            # but we can't add it later, so do it for now.
            raise TypeError("set_content not valid on multipart")
        setter = self._find_set_handler(msg, obj)
        msg.clear_content()
        setter(msg, obj, *args, **kw)

    def _find_set_handler(self, msg, obj):
        """Return the set handler registered for the type of *obj*.

        Walk the MRO of type(obj); for each class try, in order, the class
        object itself, its 'module.qualname' string, its qualname, and its
        name.  Fall back to the handler registered under None.  Raise
        KeyError naming the full path of type(obj) if nothing matches.
        """
        registry = self.set_handlers
        first_path = None
        for klass in type(obj).__mro__:
            if klass in registry:
                return registry[klass]
            qualname = klass.__qualname__
            module = getattr(klass, '__module__', '')
            dotted = '.'.join((module, qualname)) if module else qualname
            if first_path is None:
                # Remember the most-derived type's path for the error.
                first_path = dotted
            for key in (dotted, qualname, klass.__name__):
                if key in registry:
                    return registry[key]
        if None in registry:
            return registry[None]
        raise KeyError(first_path)


raw_data_manager = ContentManager()
def get_text_content(msg, errors='replace'):
    """Return the payload of *msg* decoded to a string.

    The payload is fetched with get_payload(decode=True) and decoded using
    the message's 'charset' parameter ('ASCII' when absent); *errors* is
    passed through to the decode call.
    """
    raw = msg.get_payload(decode=True)
    return raw.decode(msg.get_param('charset', 'ASCII'), errors=errors)


raw_data_manager.add_get_handler('text', get_text_content)
def get_non_text_content(msg):
    """Return the payload of *msg* as given by get_payload(decode=True)."""
    decoded = msg.get_payload(decode=True)
    return decoded


# These maintypes all share the same get handler.
for maintype in ('audio', 'image', 'video', 'application'):
    raw_data_manager.add_get_handler(maintype, get_non_text_content)
def get_message_content(msg):
    """Return the first payload item of *msg* (get_payload(0))."""
    embedded = msg.get_payload(0)
    return embedded


# Registered under the full content type for these two message subtypes.
for subtype in ('rfc822', 'external-body'):
    raw_data_manager.add_get_handler('message/{}'.format(subtype),
                                     get_message_content)
def get_and_fixup_unknown_message_content(msg):
    """Return the first payload item of *msg* converted with bytes()."""
    # If we don't understand a message subtype, we are supposed to treat it as
    # if it were application/octet-stream, per
    # tools.ietf.org/html/rfc2046#section-5.2.4.  Feedparser doesn't do that,
    # so do our best to fix things up.  Note that it is *not* appropriate to
    # model message/partial content as Message objects, so they are handled
    # here as well.  (How to reassemble them is out of scope for this comment :)
    first_item = msg.get_payload(0)
    return bytes(first_item)


# Maintype-level fallback for any message/* subtype without its own handler.
raw_data_manager.add_get_handler(
    'message', get_and_fixup_unknown_message_content)
def _prepare_set(msg, maintype, subtype, headers):
msg['Content-Type'] = '/'.join((maintype, subtype))
if headers:
if not hasattr(headers[0], 'name'):
mp = msg.policy
headers = [mp.header_factory(*mp.header_source_parse([header]))
for header in headers]
try:
for header in headers:
if header.defects:
raise header.defects[0]
msg[header.name] = header
except email.errors.HeaderDefect as exc:
raise ValueError("Invalid header: {}".format(
header.fold(policy=msg.policy))) from exc
def _finalize_set(msg, disposition, filename, cid, params):
if disposition is None and filename is not None:
disposition = 'attachment'
if disposition is not None:
msg['Content-Disposition'] = disposition
if filename is not None:
msg.set_param('filename',
filename,
header='Content-Disposition',
replace=True)
if cid is not None:
msg['Content-ID'] = cid
if params is not None:
for key, value in params.items():
msg.set_param(key, value)
# XXX: This is a cleaned-up version of base64mime.body_encode. It would
# be nice to drop both this and quoprimime.body_encode in favor of
# enhanced binascii routines that accepted a max_line_length parameter.
def _encode_base64(data, max_line_length):
encoded_lines = []
unencoded_bytes_per_line = max_line_length * 3 // 4
for i in range(0, len(data), unencoded_bytes_per_line):
thisline = data[i:i+unencoded_bytes_per_line]
encoded_lines.append(binascii.b2a_base64(thisline).decode('ascii'))
return ''.join(encoded_lines)
def _encode_text(string, charset, cte, policy):
lines = string.encode(charset).splitlines()
linesep = policy.linesep.encode('ascii')
def embeded_body(lines): return linesep.join(lines) + linesep
def normal_body(lines): return b'\n'.join(lines) + b'\n'
if cte==None:
# Use heuristics to decide on the "best" encoding.
try:
return '7bit', normal_body(lines).decode('ascii')
except UnicodeDecodeError:
pass
if (policy.cte_type == '8bit' and
max(len(x) for x in lines) <= policy.max_line_length):
return '8bit', normal_body(lines).decode('ascii', 'surrogateescape')
sniff = embeded_body(lines[:10])
sniff_qp = quoprimime.body_encode(sniff.decode('latin-1'),
policy.max_line_length)
sniff_base64 = binascii.b2a_base64(sniff)
# This is a little unfair to qp; it includes lineseps, base64 doesn't.
if len(sniff_qp) > len(sniff_base64):
cte = 'base64'
else:
cte = 'quoted-printable'
if len(lines) <= 10:
return cte, sniff_qp
if cte == '7bit':
data = normal_body(lines).decode('ascii')
elif cte == '8bit':
data = normal_body(lines).decode('ascii', 'surrogateescape')
elif cte == 'quoted-printable':
data = quoprimime.body_encode(normal_body(lines).decode('latin-1'),
policy.max_line_length)
elif cte == 'base64':
data = _encode_base64(embeded_body(lines), policy.max_line_length)
else:
raise ValueError("Unknown content transfer encoding {}".format(cte))
return cte, data
def set_text_content(msg, string, subtype="plain", charset='utf-8', cte=None,
                     disposition=None, filename=None, cid=None,
                     params=None, headers=None):
    """Store *string* in *msg* as text/*subtype* content.

    The text is encoded by _encode_text(), which also picks the content
    transfer encoding when *cte* is None.  The 'charset' parameter is set
    to *charset* after mapping it through email.charset.ALIASES.  The
    remaining arguments are handed to _prepare_set() and _finalize_set().
    """
    canonical_charset = email.charset.ALIASES.get(charset, charset)
    _prepare_set(msg, 'text', subtype, headers)
    chosen_cte, body = _encode_text(string, charset, cte, msg.policy)
    msg.set_payload(body)
    msg.set_param('charset', canonical_charset, replace=True)
    msg['Content-Transfer-Encoding'] = chosen_cte
    _finalize_set(msg, disposition, filename, cid, params)


raw_data_manager.add_set_handler(str, set_text_content)
def set_message_content(msg, message, subtype="rfc822", cte=None,
                        disposition=None, filename=None, cid=None,
                        params=None, headers=None):
    """Store *message* in *msg* as message/*subtype* content.

    Raise ValueError for subtype 'partial', and for a *cte* that the
    chosen subtype does not allow.  When *cte* is None it defaults to
    '8bit' for rfc822 and to '7bit' for every other subtype.
    """
    if subtype == 'partial':
        raise ValueError("message/partial is not supported for Message objects")
    if subtype == 'rfc822':
        if cte is None:
            # 8bit will get coerced on serialization if policy.cte_type='7bit'.
            # We may end up claiming 8bit when it isn't needed, but the only
            # negative result of that should be a gateway that needs to coerce
            # to 7bit having to look through the whole embedded message to
            # discover whether or not it actually has to do anything.
            cte = '8bit'
        elif cte not in ('7bit', '8bit', 'binary'):
            # http://tools.ietf.org/html/rfc2046#section-5.2.1 mandate.
            raise ValueError(
                "message/rfc822 parts do not support cte={}".format(cte))
    elif subtype == 'external-body':
        if cte is not None and cte != '7bit':
            # http://tools.ietf.org/html/rfc2046#section-5.2.3 mandate.
            raise ValueError(
                "message/external-body parts do not support cte={}".format(cte))
        cte = '7bit'
    elif cte is None:
        # http://tools.ietf.org/html/rfc2046#section-5.2.4 says all future
        # subtypes should be restricted to 7bit, so assume that.
        cte = '7bit'
    _prepare_set(msg, 'message', subtype, headers)
    # The embedded message becomes the single item of the payload list.
    msg.set_payload([message])
    msg['Content-Transfer-Encoding'] = cte
    _finalize_set(msg, disposition, filename, cid, params)


raw_data_manager.add_set_handler(email.message.Message, set_message_content)
def set_bytes_content(msg, data, maintype, subtype, cte='base64',
                      disposition=None, filename=None, cid=None,
                      params=None, headers=None):
    """Store the bytes-like *data* in *msg* as *maintype*/*subtype* content.

    *cte* selects the content transfer encoding applied to *data*:
    'base64' (the default), 'quoted-printable', '7bit', '8bit' or
    'binary'.  With '7bit', *data* must be pure ASCII or
    UnicodeDecodeError is raised.  The remaining arguments are handed to
    _prepare_set() and _finalize_set().
    """
    _prepare_set(msg, maintype, subtype, headers)
    if cte == 'base64':
        data = _encode_base64(data, max_line_length=msg.policy.max_line_length)
    elif cte == 'quoted-printable':
        # XXX: quoprimime.body_encode won't encode newline characters in data,
        # so we can't use it.  This means max_line_length is ignored.  Another
        # bug to fix later.  (Note: encoders.quopri is broken on line ends.)
        data = binascii.b2a_qp(data, istext=False, header=False, quotetabs=True)
        data = data.decode('ascii')
    elif cte == '7bit':
        # Make sure it really is only ASCII.  The early warning here seems
        # worth the overhead...if you care write your own content manager :).
        # bytes() is needed because memoryview has no decode() method.
        data = bytes(data).decode('ascii')
    elif cte in ('8bit', 'binary'):
        # Non-ASCII bytes are carried through as surrogate escapes.
        data = bytes(data).decode('ascii', 'surrogateescape')
    msg.set_payload(data)
    msg['Content-Transfer-Encoding'] = cte
    _finalize_set(msg, disposition, filename, cid, params)


for typ in (bytes, bytearray, memoryview):
    raw_data_manager.add_set_handler(typ, set_bytes_content)

View file

@ -8,8 +8,6 @@ __all__ = ['Message']
import re
import uu
import base64
import binascii
from io import BytesIO, StringIO
# Intrapackage imports
@ -679,7 +677,7 @@ class Message:
return failobj
def set_param(self, param, value, header='Content-Type', requote=True,
charset=None, language=''):
charset=None, language='', replace=False):
"""Set a parameter in the Content-Type header.
If the parameter already exists in the header, its value will be
@ -723,8 +721,11 @@ class Message:
else:
ctype = SEMISPACE.join([ctype, append_param])
if ctype != self.get(header):
del self[header]
self[header] = ctype
if replace:
self.replace_header(header, ctype)
else:
del self[header]
self[header] = ctype
def del_param(self, param, header='content-type', requote=True):
"""Remove the given parameter completely from the Content-Type header.
@ -905,3 +906,208 @@ class Message:
# I.e. def walk(self): ...
from email.iterators import walk
class MIMEPart(Message):
    """Message subclass adding the content-oriented (provisional) API.

    It adds helpers for locating the body and attachments of a message,
    for getting and setting content through the policy's content manager,
    and for converting a message into, or extending, a multipart.
    """

    def __init__(self, policy=None):
        """Initialize the part; *policy* defaults to email.policy.default."""
        if policy is None:
            # Imported at call time rather than at module level.
            from email.policy import default
            policy = default
        Message.__init__(self, policy)

    @property
    def is_attachment(self):
        """True if the Content-Disposition of this part is 'attachment'.

        Only the disposition type is examined, so a header that also
        carries parameters (such as 'attachment; filename="x"') counts.
        """
        c_d = self.get('content-disposition')
        if c_d is None:
            return False
        # Compare only the disposition type; anything after the first ';'
        # is parameters.
        disposition = str(c_d).split(';', 1)[0]
        return disposition.strip().lower() == 'attachment'

    def _find_body(self, part, preferencelist):
        """Yield (priority, part) pairs for body candidates under *part*.

        The priority is the index in *preferencelist* of the matching
        entry.  Attachments are skipped.  Non-related multiparts are
        searched recursively; for a multipart/related only the root part
        (named by the 'start' parameter, else the first subpart) is
        searched.
        """
        if part.is_attachment:
            return
        maintype, subtype = part.get_content_type().split('/')
        if maintype == 'text':
            if subtype in preferencelist:
                yield (preferencelist.index(subtype), part)
            return
        if maintype != 'multipart':
            return
        if subtype != 'related':
            for subpart in part.iter_parts():
                yield from self._find_body(subpart, preferencelist)
            return
        if 'related' in preferencelist:
            yield (preferencelist.index('related'), part)
        candidate = None
        start = part.get_param('start')
        if start:
            for subpart in part.iter_parts():
                if subpart['content-id'] == start:
                    candidate = subpart
                    break
        if candidate is None:
            # No usable 'start': fall back to the first subpart, if any.
            subparts = part.get_payload()
            candidate = subparts[0] if subparts else None
        if candidate is not None:
            yield from self._find_body(candidate, preferencelist)

    def get_body(self, preferencelist=('related', 'html', 'plain')):
        """Return best candidate mime part for display as 'body' of message.

        Do a depth first search, starting with self, looking for the first part
        matching each of the items in preferencelist, and return the part
        corresponding to the first item that has a match, or None if no items
        have a match.  If 'related' is not included in preferencelist, consider
        the root part of any multipart/related encountered as a candidate
        match.  Ignore parts with 'Content-Disposition: attachment'.
        """
        best_prio = len(preferencelist)
        body = None
        for prio, part in self._find_body(self, preferencelist):
            if prio < best_prio:
                best_prio = prio
                body = part
                if prio == 0:
                    # Nothing can beat the most preferred type.
                    break
        return body

    # (maintype, subtype) pairs that iter_attachments treats as body
    # candidates rather than attachments.
    _body_types = {('text', 'plain'),
                   ('text', 'html'),
                   ('multipart', 'related'),
                   ('multipart', 'alternative')}

    def iter_attachments(self):
        """Return an iterator over the non-main parts of a multipart.

        Skip the first of each occurrence of text/plain, text/html,
        multipart/related, or multipart/alternative in the multipart (unless
        they have a 'Content-Disposition: attachment' header) and include all
        remaining subparts in the returned iterator.  When applied to a
        multipart/related, return all parts except the root part.  Return an
        empty iterator when applied to a multipart/alternative or a
        non-multipart.

        The payload of the message is never modified.
        """
        maintype, subtype = self.get_content_type().split('/')
        if maintype != 'multipart' or subtype == 'alternative':
            return
        payload = self.get_payload()
        if not isinstance(payload, list):
            # Declared multipart but there is no list of subparts (nothing
            # attached yet, or a non-list payload): nothing to iterate.
            return
        # Work on a copy so the message's own payload list is left intact.
        parts = list(payload)
        if subtype == 'related':
            # For related, we treat everything but the root as an attachment.
            # The root may be indicated by 'start'; if there's no start or we
            # can't find the named start, treat the first subpart as the root.
            start = self.get_param('start')
            if start:
                found = False
                attachments = []
                for part in parts:
                    if part.get('content-id') == start:
                        found = True
                    else:
                        attachments.append(part)
                if found:
                    yield from attachments
                    return
            yield from parts[1:]
            return
        # Otherwise we more or less invert the remaining logic in get_body.
        # This only really works in edge cases (ex: non-text relateds or
        # alternatives) if the sending agent sets content-disposition.
        seen = []   # Only skip the first example of each candidate type.
        for part in parts:
            maintype, subtype = part.get_content_type().split('/')
            if ((maintype, subtype) in self._body_types and
                    not part.is_attachment and subtype not in seen):
                seen.append(subtype)
                continue
            yield part

    def iter_parts(self):
        """Return an iterator over all immediate subparts of a multipart.

        Return an empty iterator for a non-multipart.
        """
        if self.get_content_maintype() == 'multipart':
            yield from self.get_payload()

    def get_content(self, *args, content_manager=None, **kw):
        """Return this part's content via a content manager.

        *content_manager* defaults to self.policy.content_manager; all
        other arguments are passed through to its get_content method.
        """
        if content_manager is None:
            content_manager = self.policy.content_manager
        return content_manager.get_content(self, *args, **kw)

    def set_content(self, *args, content_manager=None, **kw):
        """Set this part's content via a content manager.

        *content_manager* defaults to self.policy.content_manager; all
        other arguments are passed through to its set_content method.
        """
        if content_manager is None:
            content_manager = self.policy.content_manager
        content_manager.set_content(self, *args, **kw)

    def _make_multipart(self, subtype, disallowed_subtypes, boundary):
        """Convert this message in place into a multipart/*subtype*.

        Existing Content-* headers and the payload are moved into a new
        first subpart; all other headers stay on this message.  Raise
        ValueError if this is already a multipart of *subtype* or of one
        of *disallowed_subtypes*.  If *boundary* is not None it is set as
        the 'boundary' parameter.
        """
        if self.get_content_maintype() == 'multipart':
            existing_subtype = self.get_content_subtype()
            disallowed_subtypes = disallowed_subtypes + (subtype,)
            if existing_subtype in disallowed_subtypes:
                raise ValueError("Cannot convert {} to {}".format(
                    existing_subtype, subtype))
        keep_headers = []
        part_headers = []
        for name, value in self._headers:
            if name.lower().startswith('content-'):
                part_headers.append((name, value))
            else:
                keep_headers.append((name, value))
        if part_headers:
            # There is existing content, move it to the first subpart.
            part = type(self)(policy=self.policy)
            part._headers = part_headers
            part._payload = self._payload
            self._payload = [part]
        else:
            self._payload = []
        self._headers = keep_headers
        self['Content-Type'] = 'multipart/' + subtype
        if boundary is not None:
            self.set_param('boundary', boundary)

    def make_related(self, boundary=None):
        """Convert this message into a multipart/related."""
        self._make_multipart('related', ('alternative', 'mixed'), boundary)

    def make_alternative(self, boundary=None):
        """Convert this message into a multipart/alternative."""
        self._make_multipart('alternative', ('mixed',), boundary)

    def make_mixed(self, boundary=None):
        """Convert this message into a multipart/mixed."""
        self._make_multipart('mixed', (), boundary)

    def _add_multipart(self, _subtype, *args, _disp=None, **kw):
        """Attach a new part, converting to multipart/*_subtype* if needed.

        The new part's content is set from *args* and *kw*.  If *_disp* is
        given and the part has no Content-Disposition header, *_disp*
        becomes its disposition.
        """
        if (self.get_content_maintype() != 'multipart' or
                self.get_content_subtype() != _subtype):
            getattr(self, 'make_' + _subtype)()
        part = type(self)(policy=self.policy)
        part.set_content(*args, **kw)
        if _disp and 'content-disposition' not in part:
            part['Content-Disposition'] = _disp
        self.attach(part)

    def add_related(self, *args, **kw):
        """Add a part with 'inline' disposition to a multipart/related."""
        self._add_multipart('related', *args, _disp='inline', **kw)

    def add_alternative(self, *args, **kw):
        """Add a part to a multipart/alternative."""
        self._add_multipart('alternative', *args, **kw)

    def add_attachment(self, *args, **kw):
        """Add a part with 'attachment' disposition to a multipart/mixed."""
        self._add_multipart('mixed', *args, _disp='attachment', **kw)

    def clear(self):
        """Remove all headers and the payload."""
        self._headers = []
        self._payload = None

    def clear_content(self):
        """Remove the payload and every header whose name starts with
        'content-' (case-insensitively), keeping all other headers."""
        self._headers = [(n, v) for n, v in self._headers
                         if not n.lower().startswith('content-')]
        self._payload = None
class EmailMessage(MIMEPart):
    """MIMEPart whose set_content also ensures a MIME-Version header."""

    def set_content(self, *args, **kw):
        """Set the content as MIMEPart does, then add 'MIME-Version: 1.0'
        if the message has no MIME-Version header yet."""
        super().set_content(*args, **kw)
        if 'MIME-Version' in self:
            return
        self['MIME-Version'] = '1.0'

View file

@ -5,6 +5,7 @@ code that adds all the email6 features.
from email._policybase import Policy, Compat32, compat32, _extend_docstrings
from email.utils import _has_surrogates
from email.headerregistry import HeaderRegistry as HeaderRegistry
from email.contentmanager import raw_data_manager
__all__ = [
'Compat32',
@ -58,10 +59,22 @@ class EmailPolicy(Policy):
special treatment, while all other fields are
treated as unstructured. This list will be
completed before the extension is marked stable.)
content_manager -- an object with at least two methods: get_content
and set_content. When the get_content or
set_content method of a Message object is called,
it calls the corresponding method of this object,
passing it the message object as its first argument,
and any arguments or keywords that were passed to
it as additional arguments. The default
content_manager is
:data:`~email.contentmanager.raw_data_manager`.
"""
refold_source = 'long'
header_factory = HeaderRegistry()
content_manager = raw_data_manager
def __init__(self, **kw):
# Ensure that each new instance gets a unique header factory

View file

@ -68,9 +68,13 @@ def _has_surrogates(s):
# How to deal with a string containing bytes before handing it to the
# application through the 'normal' interface.
def _sanitize(string):
# Turn any escaped bytes into unicode 'unknown' char.
original_bytes = string.encode('ascii', 'surrogateescape')
return original_bytes.decode('ascii', 'replace')
# Turn any escaped bytes into unicode 'unknown' char. If the escaped
# bytes happen to be utf-8 they will instead get decoded, even if they
# were invalid in the charset the source was supposed to be in. This
# seems like it is not a bad thing; a defect was still registered.
original_bytes = string.encode('utf-8', 'surrogateescape')
return original_bytes.decode('utf-8', 'replace')
# Helpers