mirror of
https://github.com/python/cpython.git
synced 2025-07-15 15:25:29 +00:00

<http://sf.net/projects/mimelib>. There /are/ API differences between mimelib and email, but most of the implementations are shared (except where cool Py2.2 stuff like generators are used).
154 lines
6.1 KiB
Python
154 lines
6.1 KiB
Python
# Copyright (C) 2001 Python Software Foundation
|
||
# Author: barry@zope.com (Barry Warsaw)
|
||
|
||
"""A parser of RFC 2822 and MIME email messages.
|
||
"""
|
||
|
||
import re
|
||
from cStringIO import StringIO
|
||
|
||
# Intrapackage imports
|
||
import Errors
|
||
import Message
|
||
|
||
bcre = re.compile('boundary="?([^"]+)"?', re.IGNORECASE)
|
||
EMPTYSTRING = ''
|
||
NL = '\n'
|
||
|
||
|
||
|
||
class Parser:
|
||
def __init__(self, _class=Message.Message):
|
||
"""Parser of RFC 2822 and MIME email messages.
|
||
|
||
Creates an in-memory object tree representing the email message, which
|
||
can then be manipulated and turned over to a Generator to return the
|
||
textual representation of the message.
|
||
|
||
The string must be formatted as a block of RFC 2822 headers and header
|
||
continuation lines, optionally preceeded by a `Unix-from' header. The
|
||
header block is terminated either by the end of the string or by a
|
||
blank line.
|
||
|
||
_class is the class to instantiate for new message objects when they
|
||
must be created. This class must have a constructor that can take
|
||
zero arguments. Default is Message.Message.
|
||
"""
|
||
self._class = _class
|
||
|
||
def parse(self, fp):
|
||
root = self._class()
|
||
self._parseheaders(root, fp)
|
||
self._parsebody(root, fp)
|
||
return root
|
||
|
||
def parsestr(self, text):
|
||
return self.parse(StringIO(text))
|
||
|
||
def _parseheaders(self, container, fp):
|
||
# Parse the headers, returning a list of header/value pairs. None as
|
||
# the header means the Unix-From header.
|
||
lastheader = ''
|
||
lastvalue = []
|
||
lineno = 0
|
||
while 1:
|
||
line = fp.readline()[:-1]
|
||
if not line or not line.strip():
|
||
break
|
||
lineno += 1
|
||
# Check for initial Unix From_ line
|
||
if line.startswith('From '):
|
||
if lineno == 1:
|
||
container.set_unixfrom(line)
|
||
continue
|
||
else:
|
||
raise Errors.HeaderParseError(
|
||
'Unix-from in headers after first rfc822 header')
|
||
#
|
||
# Header continuation line
|
||
if line[0] in ' \t':
|
||
if not lastheader:
|
||
raise Errors.HeaderParseError(
|
||
'Continuation line seen before first header')
|
||
lastvalue.append(line)
|
||
continue
|
||
# Normal, non-continuation header. BAW: this should check to make
|
||
# sure it's a legal header, e.g. doesn't contain spaces. Also, we
|
||
# should expose the header matching algorithm in the API, and
|
||
# allow for a non-strict parsing mode (that ignores the line
|
||
# instead of raising the exception).
|
||
i = line.find(':')
|
||
if i < 0:
|
||
raise Errors.HeaderParseError(
|
||
'Not a header, not a continuation')
|
||
if lastheader:
|
||
container[lastheader] = NL.join(lastvalue)
|
||
lastheader = line[:i]
|
||
lastvalue = [line[i+1:].lstrip()]
|
||
# Make sure we retain the last header
|
||
if lastheader:
|
||
container[lastheader] = NL.join(lastvalue)
|
||
|
||
def _parsebody(self, container, fp):
|
||
# Parse the body, but first split the payload on the content-type
|
||
# boundary if present.
|
||
boundary = isdigest = None
|
||
ctype = container['content-type']
|
||
if ctype:
|
||
mo = bcre.search(ctype)
|
||
if mo:
|
||
boundary = mo.group(1)
|
||
isdigest = container.get_type() == 'multipart/digest'
|
||
# If there's a boundary, split the payload text into its constituent
|
||
# parts and parse each separately. Otherwise, just parse the rest of
|
||
# the body as a single message. Note: any exceptions raised in the
|
||
# recursive parse need to have their line numbers coerced.
|
||
if boundary:
|
||
preamble = epilogue = None
|
||
# Split into subparts. The first boundary we're looking for won't
|
||
# have the leading newline since we're at the start of the body
|
||
# text.
|
||
separator = '--' + boundary
|
||
payload = fp.read()
|
||
start = payload.find(separator)
|
||
if start < 0:
|
||
raise Errors.BoundaryError(
|
||
"Couldn't find starting boundary: %s" % boundary)
|
||
if start > 0:
|
||
# there's some pre-MIME boundary preamble
|
||
preamble = payload[0:start]
|
||
start += len(separator) + 1 + isdigest
|
||
terminator = payload.find('\n' + separator + '--', start)
|
||
if terminator < 0:
|
||
raise Errors.BoundaryError(
|
||
"Couldn't find terminating boundary: %s" % boundary)
|
||
if terminator+len(separator)+3 < len(payload):
|
||
# there's some post-MIME boundary epilogue
|
||
epilogue = payload[terminator+len(separator)+3:]
|
||
# We split the textual payload on the boundary separator, which
|
||
# includes the trailing newline. If the container is a
|
||
# multipart/digest then the subparts are by default message/rfc822
|
||
# instead of text/plain. In that case, they'll have an extra
|
||
# newline before the headers to distinguish the message's headers
|
||
# from the subpart headers.
|
||
if isdigest:
|
||
separator += '\n\n'
|
||
else:
|
||
separator += '\n'
|
||
parts = payload[start:terminator].split('\n' + separator)
|
||
for part in parts:
|
||
msgobj = self.parsestr(part)
|
||
container.preamble = preamble
|
||
container.epilogue = epilogue
|
||
container.add_payload(msgobj)
|
||
elif ctype == 'message/rfc822':
|
||
# Create a container for the payload, but watch out for there not
|
||
# being any headers left
|
||
try:
|
||
msg = self.parse(fp)
|
||
except Errors.HeaderParseError:
|
||
msg = self._class()
|
||
self._parsebody(msg, fp)
|
||
container.add_payload(msg)
|
||
else:
|
||
container.add_payload(fp.read())
|