#4661: add bytes parsing and generation to email (email version bump to 5.1.0)

The work on this is not 100% complete, but everything is present to
allow real-world testing of the code.  The only remaining major todo
item is to (hopefully!) enhance the handling of non-ASCII bytes in headers
converted to unicode by RFC2047 encoding them rather than replacing them with
'?'s.
This commit is contained in:
R. David Murray 2010-10-08 15:55:28 +00:00
parent 59fdd6736b
commit 96fd54eaec
11 changed files with 708 additions and 85 deletions

View file

@ -12,8 +12,9 @@ import time
import random
import warnings
from io import StringIO
from io import StringIO, BytesIO
from email.header import Header
from email.message import _has_surrogates
UNDERSCORE = '_'
NL = '\n'
@ -72,7 +73,7 @@ class Generator:
ufrom = msg.get_unixfrom()
if not ufrom:
ufrom = 'From nobody ' + time.ctime(time.time())
print(ufrom, file=self._fp)
self.write(ufrom + NL)
self._write(msg)
def clone(self, fp):
@ -83,6 +84,29 @@ class Generator:
# Protected interface - undocumented ;/
#
# Note that we use 'self.write' when what we are writing is coming from
# the source, and self._fp.write when what we are writing is coming from a
# buffer (because the Bytes subclass has already had a chance to transform
# the data in its write method in that case). This is an entirely
# pragmatic split determined by experiment; we could be more general by
# always using write and having the Bytes subclass write method detect when
# it has already transformed the input; but, since this whole thing is a
# hack anyway this seems good enough.
# We use these class constants when we need to manipulate data that has
# already been written to a buffer (ex: constructing a re to check the
# boundary), and the module level NL constant when adding new output to a
# buffer via self.write, because 'write' always takes strings.
# Having write always take strings makes the code simpler, but there are
# a few occasions when we need to write previously created data back
# to the buffer or to a new buffer; for those cases we use self._fp.write.
# Newline and empty-string values in the type held by this generator's
# buffers (str here; BytesGenerator rebinds both to bytes).
_NL = NL
_EMPTY = ''
def _new_buffer(self):
    """Return a fresh, empty in-memory buffer for caching generated output.

    This text generator uses a StringIO; BytesGenerator overrides this to
    return a BytesIO instead.
    """
    buffer = StringIO()
    return buffer
def _write(self, msg):
# We can't write the headers yet because of the following scenario:
# say a multipart message includes the boundary string somewhere in
@ -91,13 +115,13 @@ class Generator:
# parameter.
#
# The way we do this, so as to make the _handle_*() methods simpler,
# is to cache any subpart writes into a StringIO. Then we write the
# headers and the StringIO contents. That way, subpart handlers can
# is to cache any subpart writes into a buffer. Then we write the
# headers and the buffer contents. That way, subpart handlers can
# Do The Right Thing, and can still modify the Content-Type: header if
# necessary.
oldfp = self._fp
try:
self._fp = sfp = StringIO()
self._fp = sfp = self._new_buffer()
self._dispatch(msg)
finally:
self._fp = oldfp
@ -132,16 +156,16 @@ class Generator:
def _write_headers(self, msg):
for h, v in msg.items():
print('%s:' % h, end=' ', file=self._fp)
self.write('%s: ' % h)
if isinstance(v, Header):
print(v.encode(maxlinelen=self._maxheaderlen), file=self._fp)
self.write(v.encode(maxlinelen=self._maxheaderlen)+NL)
else:
# Header's got lots of smarts, so use it.
header = Header(v, maxlinelen=self._maxheaderlen,
header_name=h)
print(header.encode(), file=self._fp)
self.write(header.encode()+NL)
# A blank line always separates headers from body
print(file=self._fp)
self.write(NL)
#
# Handlers for writing types and subtypes
@ -153,9 +177,15 @@ class Generator:
return
if not isinstance(payload, str):
raise TypeError('string payload expected: %s' % type(payload))
if _has_surrogates(msg._payload):
charset = msg.get_param('charset')
if charset is not None:
del msg['content-transfer-encoding']
msg.set_payload(payload, charset)
payload = msg.get_payload()
if self._mangle_from_:
payload = fcre.sub('>From ', payload)
self._fp.write(payload)
self.write(payload)
# Default body handler
_writeBody = _handle_text
@ -170,21 +200,21 @@ class Generator:
subparts = []
elif isinstance(subparts, str):
# e.g. a non-strict parse of a message with no starting boundary.
self._fp.write(subparts)
self.write(subparts)
return
elif not isinstance(subparts, list):
# Scalar payload
subparts = [subparts]
for part in subparts:
s = StringIO()
s = self._new_buffer()
g = self.clone(s)
g.flatten(part, unixfrom=False)
msgtexts.append(s.getvalue())
# Now make sure the boundary we've selected doesn't appear in any of
# the message texts.
alltext = NL.join(msgtexts)
alltext = self._NL.join(msgtexts)
# BAW: What about boundaries that are wrapped in double-quotes?
boundary = msg.get_boundary(failobj=_make_boundary(alltext))
boundary = msg.get_boundary(failobj=self._make_boundary(alltext))
# If we had to calculate a new boundary because the body text
# contained that string, set the new boundary. We don't do it
# unconditionally because, while set_boundary() preserves order, it
@ -195,9 +225,9 @@ class Generator:
msg.set_boundary(boundary)
# If there's a preamble, write it out, with a trailing CRLF
if msg.preamble is not None:
print(msg.preamble, file=self._fp)
self.write(msg.preamble + NL)
# dash-boundary transport-padding CRLF
print('--' + boundary, file=self._fp)
self.write('--' + boundary + NL)
# body-part
if msgtexts:
self._fp.write(msgtexts.pop(0))
@ -206,14 +236,14 @@ class Generator:
# --> CRLF body-part
for body_part in msgtexts:
# delimiter transport-padding CRLF
print('\n--' + boundary, file=self._fp)
self.write('\n--' + boundary + NL)
# body-part
self._fp.write(body_part)
# close-delimiter transport-padding
self._fp.write('\n--' + boundary + '--')
self.write('\n--' + boundary + '--')
if msg.epilogue is not None:
print(file=self._fp)
self._fp.write(msg.epilogue)
self.write(NL)
self.write(msg.epilogue)
def _handle_multipart_signed(self, msg):
# The contents of signed parts has to stay unmodified in order to keep
@ -232,23 +262,23 @@ class Generator:
# block and the boundary. Sigh.
blocks = []
for part in msg.get_payload():
s = StringIO()
s = self._new_buffer()
g = self.clone(s)
g.flatten(part, unixfrom=False)
text = s.getvalue()
lines = text.split('\n')
lines = text.split(self._NL)
# Strip off the unnecessary trailing empty line
if lines and lines[-1] == '':
blocks.append(NL.join(lines[:-1]))
if lines and lines[-1] == self._EMPTY:
blocks.append(self._NL.join(lines[:-1]))
else:
blocks.append(text)
# Now join all the blocks with an empty line. This has the lovely
# effect of separating each block with an empty line, but not adding
# an extra one after the last one.
self._fp.write(NL.join(blocks))
self._fp.write(self._NL.join(blocks))
def _handle_message(self, msg):
s = StringIO()
s = self._new_buffer()
g = self.clone(s)
# The payload of a message/rfc822 part should be a multipart sequence
# of length 1. The zeroth element of the list should be the Message
@ -265,6 +295,90 @@ class Generator:
payload = s.getvalue()
self._fp.write(payload)
# This used to be a module level function; we use a classmethod for this
# and _compile_re so we can continue to provide the module level function
# for backward compatibility by doing
# _make_boundary = Generator._make_boundary
# at the end of the module. It *is* internal, so we could drop that...
@classmethod
def _make_boundary(cls, text=None):
    """Return a randomly generated MIME boundary string.

    If *text* is None the first candidate is returned as-is.  Otherwise the
    candidate is suffixed with '.0', '.1', ... until no line of *text*
    consists of '--' + candidate, optionally followed by '--'.  The search
    pattern is built through cls._compile_re so that subclasses can adapt
    it to the type of *text*.
    """
    base = ('=' * 15) + (_fmt % random.randrange(sys.maxsize)) + '=='
    if text is None:
        return base
    candidate = base
    suffix = 0
    # Keep trying new suffixes for as long as the candidate collides with
    # a boundary-looking line in the text.
    while cls._compile_re('^--' + re.escape(candidate) + '(--)?$',
                          re.MULTILINE).search(text):
        candidate = base + '.' + str(suffix)
        suffix += 1
    return candidate
@classmethod
def _compile_re(cls, s, flags):
    """Compile the str pattern *s* with *flags* and return the pattern object.

    Exists as a separate hook so a subclass can convert the pattern to the
    type of the data it will be matched against.
    """
    compiled = re.compile(s, flags)
    return compiled
class BytesGenerator(Generator):
    """Generates a bytes version of a Message object tree.

    Functionally identical to the base Generator except that the output is
    bytes and not string.  When surrogates were used in the input to encode
    bytes, these are decoded back to bytes for output.

    The outfp object must accept bytes in its write method.
    """

    # Bytes versions of these constants for use in manipulating data from
    # the BytesIO buffer.
    _NL = NL.encode('ascii')
    _EMPTY = b''

    def write(self, s):
        """Encode the str *s* and write the resulting bytes to the output.

        The text is encoded as ASCII with the 'surrogateescape' error
        handler, so surrogate-escaped characters are emitted as the raw
        bytes they stand for.
        """
        self._fp.write(s.encode('ascii', 'surrogateescape'))

    def _new_buffer(self):
        """Return a fresh BytesIO to cache generated bytes in."""
        return BytesIO()

    def _write_headers(self, msg):
        """Write every header of *msg*, then the blank separator line.

        This is almost the same as the string version, except for handling
        strings with 8bit bytes.  The raw msg._headers list is used.
        """
        for h, v in msg._headers:
            self.write('%s: ' % h)
            if isinstance(v, Header):
                text = v.encode(maxlinelen=self._maxheaderlen)
            elif _has_surrogates(v):
                # If we have raw 8bit data in a byte string, we have no idea
                # what the encoding is.  There is no safe way to split this
                # string.  If it's ascii-subset, then we could do a normal
                # ascii split, but if it's multibyte then we could break the
                # string.  There's no way to know so the least harm seems to
                # be to not split the string and risk it being too long.
                text = v
            else:
                # Header's got lots of smarts and this string is safe...
                text = Header(v, maxlinelen=self._maxheaderlen,
                              header_name=h).encode()
            self.write(text + NL)
        # A blank line always separates headers from body
        self.write(NL)

    def _handle_text(self, msg):
        """Write the text payload of *msg*.

        If the payload string has surrogates the original source was bytes,
        so it is written back out unchanged.  Every other payload is handed
        to the base class handler.
        """
        payload = msg._payload
        # Only a str can carry surrogate escapes.  Probing None or a
        # non-string payload would fail inside the surrogate check instead
        # of reaching the base handler, which is where missing and
        # non-string payloads are dealt with.
        if isinstance(payload, str) and _has_surrogates(payload):
            self.write(payload)
        else:
            super(BytesGenerator, self)._handle_text(msg)

    @classmethod
    def _compile_re(cls, s, flags):
        """Compile the str pattern *s* as a bytes pattern with *flags*.

        The pattern is ASCII-encoded so it can be matched against bytes.
        """
        return re.compile(s.encode('ascii'), flags)
_FMT = '[Non-text (%(type)s) part of message omitted, filename %(filename)s]'
@ -325,23 +439,9 @@ class DecodedGenerator(Generator):
# Helper
# Helper used by Generator._make_boundary
_width = len(repr(sys.maxsize-1))
_fmt = '%%0%dd' % _width
def _make_boundary(text=None):
# Craft a random boundary. If text is given, ensure that the chosen
# boundary doesn't appear in the text.
token = random.randrange(sys.maxsize)
boundary = ('=' * 15) + (_fmt % token) + '=='
if text is None:
return boundary
b = boundary
counter = 0
while True:
cre = re.compile('^--' + re.escape(b) + '(--)?$', re.MULTILINE)
if not cre.search(text):
break
b = boundary + '.' + str(counter)
counter += 1
return b
# Backward compatibility
_make_boundary = Generator._make_boundary