mirror of
https://github.com/python/cpython.git
synced 2025-10-09 08:31:26 +00:00
#4661: add bytes parsing and generation to email (email version bump to 5.1.0)
The work on this is not 100% complete, but everything needed for real-world testing of the code is present. The only remaining major todo item is to (hopefully!) improve the handling of non-ASCII bytes in headers that are converted to unicode: they should be RFC 2047 encoded rather than replaced with '?'s.
This commit is contained in:
parent
59fdd6736b
commit
96fd54eaec
11 changed files with 708 additions and 85 deletions
|
@ -12,8 +12,9 @@ import time
|
|||
import random
|
||||
import warnings
|
||||
|
||||
from io import StringIO
|
||||
from io import StringIO, BytesIO
|
||||
from email.header import Header
|
||||
from email.message import _has_surrogates
|
||||
|
||||
UNDERSCORE = '_'
|
||||
NL = '\n'
|
||||
|
@ -72,7 +73,7 @@ class Generator:
|
|||
ufrom = msg.get_unixfrom()
|
||||
if not ufrom:
|
||||
ufrom = 'From nobody ' + time.ctime(time.time())
|
||||
print(ufrom, file=self._fp)
|
||||
self.write(ufrom + NL)
|
||||
self._write(msg)
|
||||
|
||||
def clone(self, fp):
|
||||
|
@ -83,6 +84,29 @@ class Generator:
|
|||
# Protected interface - undocumented ;/
|
||||
#
|
||||
|
||||
# Note that we use 'self.write' when what we are writing is coming from
|
||||
# the source, and self._fp.write when what we are writing is coming from a
|
||||
# buffer (because the Bytes subclass has already had a chance to transform
|
||||
# the data in its write method in that case). This is an entirely
|
||||
# pragmatic split determined by experiment; we could be more general by
|
||||
# always using write and having the Bytes subclass write method detect when
|
||||
# it has already transformed the input; but, since this whole thing is a
|
||||
# hack anyway this seems good enough.
|
||||
|
||||
# We use these class constants when we need to manipulate data that has
|
||||
# already been written to a buffer (ex: constructing a re to check the
|
||||
# boundary), and the module level NL constant when adding new output to a
|
||||
# buffer via self.write, because 'write' always takes strings.
|
||||
# Having write always take strings makes the code simpler, but there are
|
||||
# a few occasions when we need to write previously created data back
|
||||
# to the buffer or to a new buffer; for those cases we use self._fp.write.
|
||||
_NL = NL
|
||||
_EMPTY = ''
|
||||
|
||||
def _new_buffer(self):
    """Return a new, empty in-memory buffer for collecting generated output.

    The base generator produces text, so the buffer is a StringIO.
    """
    # BytesGenerator overrides this to return BytesIO.
    buffer = StringIO()
    return buffer
|
||||
|
||||
def _write(self, msg):
|
||||
# We can't write the headers yet because of the following scenario:
|
||||
# say a multipart message includes the boundary string somewhere in
|
||||
|
@ -91,13 +115,13 @@ class Generator:
|
|||
# parameter.
|
||||
#
|
||||
# The way we do this, so as to make the _handle_*() methods simpler,
|
||||
# is to cache any subpart writes into a StringIO. The we write the
|
||||
# headers and the StringIO contents. That way, subpart handlers can
|
||||
# is to cache any subpart writes into a buffer. Then we write the
|
||||
# headers and the buffer contents. That way, subpart handlers can
|
||||
# Do The Right Thing, and can still modify the Content-Type: header if
|
||||
# necessary.
|
||||
oldfp = self._fp
|
||||
try:
|
||||
self._fp = sfp = StringIO()
|
||||
self._fp = sfp = self._new_buffer()
|
||||
self._dispatch(msg)
|
||||
finally:
|
||||
self._fp = oldfp
|
||||
|
@ -132,16 +156,16 @@ class Generator:
|
|||
|
||||
def _write_headers(self, msg):
|
||||
for h, v in msg.items():
|
||||
print('%s:' % h, end=' ', file=self._fp)
|
||||
self.write('%s: ' % h)
|
||||
if isinstance(v, Header):
|
||||
print(v.encode(maxlinelen=self._maxheaderlen), file=self._fp)
|
||||
self.write(v.encode(maxlinelen=self._maxheaderlen)+NL)
|
||||
else:
|
||||
# Header's got lots of smarts, so use it.
|
||||
header = Header(v, maxlinelen=self._maxheaderlen,
|
||||
header_name=h)
|
||||
print(header.encode(), file=self._fp)
|
||||
self.write(header.encode()+NL)
|
||||
# A blank line always separates headers from body
|
||||
print(file=self._fp)
|
||||
self.write(NL)
|
||||
|
||||
#
|
||||
# Handlers for writing types and subtypes
|
||||
|
@ -153,9 +177,15 @@ class Generator:
|
|||
return
|
||||
if not isinstance(payload, str):
|
||||
raise TypeError('string payload expected: %s' % type(payload))
|
||||
if _has_surrogates(msg._payload):
|
||||
charset = msg.get_param('charset')
|
||||
if charset is not None:
|
||||
del msg['content-transfer-encoding']
|
||||
msg.set_payload(payload, charset)
|
||||
payload = msg.get_payload()
|
||||
if self._mangle_from_:
|
||||
payload = fcre.sub('>From ', payload)
|
||||
self._fp.write(payload)
|
||||
self.write(payload)
|
||||
|
||||
# Default body handler
|
||||
_writeBody = _handle_text
|
||||
|
@ -170,21 +200,21 @@ class Generator:
|
|||
subparts = []
|
||||
elif isinstance(subparts, str):
|
||||
# e.g. a non-strict parse of a message with no starting boundary.
|
||||
self._fp.write(subparts)
|
||||
self.write(subparts)
|
||||
return
|
||||
elif not isinstance(subparts, list):
|
||||
# Scalar payload
|
||||
subparts = [subparts]
|
||||
for part in subparts:
|
||||
s = StringIO()
|
||||
s = self._new_buffer()
|
||||
g = self.clone(s)
|
||||
g.flatten(part, unixfrom=False)
|
||||
msgtexts.append(s.getvalue())
|
||||
# Now make sure the boundary we've selected doesn't appear in any of
|
||||
# the message texts.
|
||||
alltext = NL.join(msgtexts)
|
||||
alltext = self._NL.join(msgtexts)
|
||||
# BAW: What about boundaries that are wrapped in double-quotes?
|
||||
boundary = msg.get_boundary(failobj=_make_boundary(alltext))
|
||||
boundary = msg.get_boundary(failobj=self._make_boundary(alltext))
|
||||
# If we had to calculate a new boundary because the body text
|
||||
# contained that string, set the new boundary. We don't do it
|
||||
# unconditionally because, while set_boundary() preserves order, it
|
||||
|
@ -195,9 +225,9 @@ class Generator:
|
|||
msg.set_boundary(boundary)
|
||||
# If there's a preamble, write it out, with a trailing CRLF
|
||||
if msg.preamble is not None:
|
||||
print(msg.preamble, file=self._fp)
|
||||
self.write(msg.preamble + NL)
|
||||
# dash-boundary transport-padding CRLF
|
||||
print('--' + boundary, file=self._fp)
|
||||
self.write('--' + boundary + NL)
|
||||
# body-part
|
||||
if msgtexts:
|
||||
self._fp.write(msgtexts.pop(0))
|
||||
|
@ -206,14 +236,14 @@ class Generator:
|
|||
# --> CRLF body-part
|
||||
for body_part in msgtexts:
|
||||
# delimiter transport-padding CRLF
|
||||
print('\n--' + boundary, file=self._fp)
|
||||
self.write('\n--' + boundary + NL)
|
||||
# body-part
|
||||
self._fp.write(body_part)
|
||||
# close-delimiter transport-padding
|
||||
self._fp.write('\n--' + boundary + '--')
|
||||
self.write('\n--' + boundary + '--')
|
||||
if msg.epilogue is not None:
|
||||
print(file=self._fp)
|
||||
self._fp.write(msg.epilogue)
|
||||
self.write(NL)
|
||||
self.write(msg.epilogue)
|
||||
|
||||
def _handle_multipart_signed(self, msg):
|
||||
# The contents of signed parts has to stay unmodified in order to keep
|
||||
|
@ -232,23 +262,23 @@ class Generator:
|
|||
# block and the boundary. Sigh.
|
||||
blocks = []
|
||||
for part in msg.get_payload():
|
||||
s = StringIO()
|
||||
s = self._new_buffer()
|
||||
g = self.clone(s)
|
||||
g.flatten(part, unixfrom=False)
|
||||
text = s.getvalue()
|
||||
lines = text.split('\n')
|
||||
lines = text.split(self._NL)
|
||||
# Strip off the unnecessary trailing empty line
|
||||
if lines and lines[-1] == '':
|
||||
blocks.append(NL.join(lines[:-1]))
|
||||
if lines and lines[-1] == self._EMPTY:
|
||||
blocks.append(self._NL.join(lines[:-1]))
|
||||
else:
|
||||
blocks.append(text)
|
||||
# Now join all the blocks with an empty line. This has the lovely
|
||||
# effect of separating each block with an empty line, but not adding
|
||||
# an extra one after the last one.
|
||||
self._fp.write(NL.join(blocks))
|
||||
self._fp.write(self._NL.join(blocks))
|
||||
|
||||
def _handle_message(self, msg):
|
||||
s = StringIO()
|
||||
s = self._new_buffer()
|
||||
g = self.clone(s)
|
||||
# The payload of a message/rfc822 part should be a multipart sequence
|
||||
# of length 1. The zeroth element of the list should be the Message
|
||||
|
@ -265,6 +295,90 @@ class Generator:
|
|||
payload = s.getvalue()
|
||||
self._fp.write(payload)
|
||||
|
||||
# This used to be a module level function; we use a classmethod for this
|
||||
# and _compile_re so we can continue to provide the module level function
|
||||
# for backward compatibility by doing
|
||||
# _make_boundary = Generator._make_boundary
|
||||
# at the end of the module. It *is* internal, so we could drop that...
|
||||
@classmethod
def _make_boundary(cls, text=None):
    """Return a randomly generated multipart boundary string.

    When *text* is given, the returned boundary does not occur in it as a
    delimiter line ('--' + boundary, optionally followed by '--'): a
    '.<counter>' suffix is appended to the random base until no such line
    is found.  When *text* is None the random base is returned as is.
    """
    base = ('=' * 15) + (_fmt % random.randrange(sys.maxsize)) + '=='
    if text is None:
        return base

    def collides(candidate):
        # Go through cls._compile_re so that a subclass can adapt the
        # pattern to the type of data held in *text*.
        pattern = '^--' + re.escape(candidate) + '(--)?$'
        return cls._compile_re(pattern, re.MULTILINE).search(text)

    candidate = base
    suffix = 0
    while collides(candidate):
        candidate = base + '.' + str(suffix)
        suffix += 1
    return candidate
|
||||
|
||||
@classmethod
def _compile_re(cls, s, flags):
    """Compile the pattern string *s* with *flags* and return the regex.

    This is a separate hook so that a subclass can change how the pattern
    is prepared before it is compiled.
    """
    compiled = re.compile(s, flags)
    return compiled
|
||||
|
||||
|
||||
class BytesGenerator(Generator):
    """Generates a bytes version of a Message object tree.

    Functionally identical to the base Generator except that the output is
    bytes and not string.  When surrogates were used in the input to encode
    bytes, these are decoded back to bytes for output.

    The outfp object must accept bytes in its write method.
    """

    # Bytes versions of these constants for use in manipulating data from
    # the BytesIO buffer.
    _NL = NL.encode('ascii')
    _EMPTY = b''

    def write(self, s):
        """Encode the string *s* and write the resulting bytes to the output.

        The 'surrogateescape' error handler converts surrogate code points
        that stand in for undecodable input bytes back into those bytes.
        """
        self._fp.write(s.encode('ascii', 'surrogateescape'))

    def _new_buffer(self):
        """Return a new, empty BytesIO for collecting generated output."""
        return BytesIO()

    def _write_headers(self, msg):
        """Write every header of *msg*, followed by a blank line.

        This is almost the same as the string version, except for handling
        strings with 8bit bytes.
        """
        # Iterate the raw header list so that values carrying surrogates
        # are seen exactly as they were stored.
        for h, v in msg._headers:
            self.write('%s: ' % h)
            if isinstance(v, Header):
                self.write(v.encode(maxlinelen=self._maxheaderlen) + NL)
            elif _has_surrogates(v):
                # If we have raw 8bit data in a byte string, we have no idea
                # what the encoding is.  There is no safe way to split this
                # string.  If it's ascii-subset, then we could do a normal
                # ascii split, but if it's multibyte then we could break the
                # string.  There's no way to know so the least harm seems to
                # be to not split the string and risk it being too long.
                self.write(v + NL)
            else:
                # Header's got lots of smarts and this string is safe...
                header = Header(v, maxlinelen=self._maxheaderlen,
                                header_name=h)
                self.write(header.encode() + NL)
        # A blank line always separates headers from body
        self.write(NL)

    def _handle_text(self, msg):
        """Write the body of a text part.

        If the payload string has surrogates the original source was bytes,
        so it is written straight back out (after the usual 'From ' mangling
        when that is enabled).  Everything else is delegated to the base
        class handler.
        """
        payload = msg._payload
        # Only a string can carry surrogates.  A missing (None) or
        # non-string payload is left to the base class handler instead of
        # failing inside the surrogate check.
        if isinstance(payload, str) and _has_surrogates(payload):
            if self._mangle_from_:
                # Keep the shortcut consistent with the base handler, which
                # escapes lines starting with 'From ' when mangling is on.
                payload = fcre.sub('>From ', payload)
            self.write(payload)
        else:
            super()._handle_text(msg)

    @classmethod
    def _compile_re(cls, s, flags):
        """Compile the pattern string *s* as a bytes regular expression."""
        return re.compile(s.encode('ascii'), flags)
|
||||
|
||||
|
||||
|
||||
_FMT = '[Non-text (%(type)s) part of message omitted, filename %(filename)s]'
|
||||
|
@ -325,23 +439,9 @@ class DecodedGenerator(Generator):
|
|||
|
||||
|
||||
|
||||
# Helper
|
||||
# Helper used by Generator._make_boundary
|
||||
_width = len(repr(sys.maxsize-1))
|
||||
_fmt = '%%0%dd' % _width
|
||||
|
||||
def _make_boundary(text=None):
|
||||
# Craft a random boundary. If text is given, ensure that the chosen
|
||||
# boundary doesn't appear in the text.
|
||||
token = random.randrange(sys.maxsize)
|
||||
boundary = ('=' * 15) + (_fmt % token) + '=='
|
||||
if text is None:
|
||||
return boundary
|
||||
b = boundary
|
||||
counter = 0
|
||||
while True:
|
||||
cre = re.compile('^--' + re.escape(b) + '(--)?$', re.MULTILINE)
|
||||
if not cre.search(text):
|
||||
break
|
||||
b = boundary + '.' + str(counter)
|
||||
counter += 1
|
||||
return b
|
||||
# Backward compatibility
|
||||
_make_boundary = Generator._make_boundary
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue