mirror of
https://github.com/python/cpython.git
synced 2025-09-26 18:29:57 +00:00
Anthony Baxter's patch for non-strict parsing. This adds a `strict'
argument to the constructor -- defaulting to true -- which is different from Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing once the header block has been seen, i.e. it does /not/ parse or even read the body of the message. This is used for parsing message/rfc822 type messages.

We need test cases for the non-strict parsing. Anthony will supply these.

_parsebody(): We can get rid of the isdigest end-of-line kludges, although we still need to know if we're parsing a multipart/digest so we can set the default type accordingly.
This commit is contained in:
parent
a0c8b9d4d5
commit
f6caeba03a
1 changed file with 71 additions and 24 deletions
|
@ -14,10 +14,9 @@ from email import Message
|
||||||
EMPTYSTRING = ''
|
EMPTYSTRING = ''
|
||||||
NL = '\n'
|
NL = '\n'
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Parser:
|
class Parser:
|
||||||
def __init__(self, _class=Message.Message):
|
def __init__(self, _class=Message.Message, strict=1):
|
||||||
"""Parser of RFC 2822 and MIME email messages.
|
"""Parser of RFC 2822 and MIME email messages.
|
||||||
|
|
||||||
Creates an in-memory object tree representing the email message, which
|
Creates an in-memory object tree representing the email message, which
|
||||||
|
@ -32,17 +31,25 @@ class Parser:
|
||||||
_class is the class to instantiate for new message objects when they
|
_class is the class to instantiate for new message objects when they
|
||||||
must be created. This class must have a constructor that can take
|
must be created. This class must have a constructor that can take
|
||||||
zero arguments. Default is Message.Message.
|
zero arguments. Default is Message.Message.
|
||||||
|
|
||||||
|
Optional strict tells the parser to be strictly RFC compliant or to be
|
||||||
|
more forgiving in parsing of ill-formatted MIME documents. When
|
||||||
|
non-strict mode is used, the parser will try to make up for missing or
|
||||||
|
erroneous boundaries and other peculiarities seen in the wild.
|
||||||
|
Defaults to strict parsing.
|
||||||
"""
|
"""
|
||||||
self._class = _class
|
self._class = _class
|
||||||
|
self._strict = strict
|
||||||
|
|
||||||
def parse(self, fp):
|
def parse(self, fp, headersonly=0):
|
||||||
root = self._class()
|
root = self._class()
|
||||||
self._parseheaders(root, fp)
|
self._parseheaders(root, fp)
|
||||||
self._parsebody(root, fp)
|
if not headersonly:
|
||||||
|
self._parsebody(root, fp)
|
||||||
return root
|
return root
|
||||||
|
|
||||||
def parsestr(self, text):
|
def parsestr(self, text, headersonly=0):
|
||||||
return self.parse(StringIO(text))
|
return self.parse(StringIO(text), headersonly=headersonly)
|
||||||
|
|
||||||
def _parseheaders(self, container, fp):
|
def _parseheaders(self, container, fp):
|
||||||
# Parse the headers, returning a list of header/value pairs. None as
|
# Parse the headers, returning a list of header/value pairs. None as
|
||||||
|
@ -67,9 +74,13 @@ class Parser:
|
||||||
if lineno == 1:
|
if lineno == 1:
|
||||||
container.set_unixfrom(line)
|
container.set_unixfrom(line)
|
||||||
continue
|
continue
|
||||||
else:
|
elif self._strict:
|
||||||
raise Errors.HeaderParseError(
|
raise Errors.HeaderParseError(
|
||||||
'Unix-from in headers after first rfc822 header')
|
'Unix-from in headers after first rfc822 header')
|
||||||
|
else:
|
||||||
|
# ignore the weirdly placed From_ line
|
||||||
|
# XXX: maybe set unixfrom anyway? or only if not already?
|
||||||
|
continue
|
||||||
# Header continuation line
|
# Header continuation line
|
||||||
if line[0] in ' \t':
|
if line[0] in ' \t':
|
||||||
if not lastheader:
|
if not lastheader:
|
||||||
|
@ -84,8 +95,15 @@ class Parser:
|
||||||
# instead of raising the exception).
|
# instead of raising the exception).
|
||||||
i = line.find(':')
|
i = line.find(':')
|
||||||
if i < 0:
|
if i < 0:
|
||||||
raise Errors.HeaderParseError(
|
if self._strict:
|
||||||
'Not a header, not a continuation')
|
raise Errors.HeaderParseError(
|
||||||
|
"Not a header, not a continuation: ``%s''"%line)
|
||||||
|
elif lineno == 1 and line.startswith('--'):
|
||||||
|
# allow through duplicate boundary tags.
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
raise Errors.HeaderParseError(
|
||||||
|
"Not a header, not a continuation: ``%s''"%line)
|
||||||
if lastheader:
|
if lastheader:
|
||||||
container[lastheader] = NL.join(lastvalue)
|
container[lastheader] = NL.join(lastvalue)
|
||||||
lastheader = line[:i]
|
lastheader = line[:i]
|
||||||
|
@ -122,31 +140,60 @@ class Parser:
|
||||||
cre = re.compile('\r\n|\r|\n')
|
cre = re.compile('\r\n|\r|\n')
|
||||||
mo = cre.search(payload, start)
|
mo = cre.search(payload, start)
|
||||||
if mo:
|
if mo:
|
||||||
start += len(mo.group(0)) * (1 + isdigest)
|
start += len(mo.group(0))
|
||||||
# We create a compiled regexp first because we need to be able to
|
# We create a compiled regexp first because we need to be able to
|
||||||
# specify the start position, and the module function doesn't
|
# specify the start position, and the module function doesn't
|
||||||
# support this signature. :(
|
# support this signature. :(
|
||||||
cre = re.compile('(?P<sep>\r\n|\r|\n)' +
|
cre = re.compile('(?P<sep>\r\n|\r|\n)' +
|
||||||
re.escape(separator) + '--')
|
re.escape(separator) + '--')
|
||||||
mo = cre.search(payload, start)
|
mo = cre.search(payload, start)
|
||||||
if not mo:
|
if mo:
|
||||||
|
terminator = mo.start()
|
||||||
|
linesep = mo.group('sep')
|
||||||
|
if mo.end() < len(payload):
|
||||||
|
# there's some post-MIME boundary epilogue
|
||||||
|
epilogue = payload[mo.end():]
|
||||||
|
elif self._strict:
|
||||||
raise Errors.BoundaryError(
|
raise Errors.BoundaryError(
|
||||||
"Couldn't find terminating boundary: %s" % boundary)
|
"Couldn't find terminating boundary: %s" % boundary)
|
||||||
terminator = mo.start()
|
else:
|
||||||
linesep = mo.group('sep')
|
# handle the case of no trailing boundary. I hate mail clients.
|
||||||
if mo.end() < len(payload):
|
# check that it ends in a blank line
|
||||||
# there's some post-MIME boundary epilogue
|
endre = re.compile('(?P<sep>\r\n|\r|\n){2}$')
|
||||||
epilogue = payload[mo.end():]
|
mo = endre.search(payload)
|
||||||
|
if not mo:
|
||||||
|
raise Errors.BoundaryError(
|
||||||
|
"Couldn't find terminating boundary, and no "+
|
||||||
|
"trailing empty line")
|
||||||
|
else:
|
||||||
|
linesep = mo.group('sep')
|
||||||
|
terminator = len(payload)
|
||||||
# We split the textual payload on the boundary separator, which
|
# We split the textual payload on the boundary separator, which
|
||||||
# includes the trailing newline. If the container is a
|
# includes the trailing newline. If the container is a
|
||||||
# multipart/digest then the subparts are by default message/rfc822
|
# multipart/digest then the subparts are by default message/rfc822
|
||||||
# instead of text/plain. In that case, they'll have an extra
|
# instead of text/plain. In that case, they'll have an optional
|
||||||
# newline before the headers to distinguish the message's headers
|
# block of MIME headers, then an empty line followed by the
|
||||||
# from the subpart headers.
|
# message headers.
|
||||||
separator += linesep * (1 + isdigest)
|
separator += linesep
|
||||||
parts = payload[start:terminator].split(linesep + separator)
|
parts = payload[start:terminator].split(linesep + separator)
|
||||||
for part in parts:
|
for part in parts:
|
||||||
msgobj = self.parsestr(part)
|
if isdigest:
|
||||||
|
if part[0] == linesep:
|
||||||
|
# There's no header block so create an empty message
|
||||||
|
# object as the container, and lop off the newline so
|
||||||
|
# we can parse the sub-subobject
|
||||||
|
msgobj = self._class()
|
||||||
|
part = part[1:]
|
||||||
|
else:
|
||||||
|
parthdrs, part = part.split(linesep+linesep, 1)
|
||||||
|
# msgobj in this case is the "message/rfc822" container
|
||||||
|
msgobj = self.parsestr(parthdrs, headersonly=1)
|
||||||
|
# while submsgobj is the message itself
|
||||||
|
submsgobj = self.parsestr(part)
|
||||||
|
msgobj.attach(submsgobj)
|
||||||
|
msgobj.set_default_type('message/rfc822')
|
||||||
|
else:
|
||||||
|
msgobj = self.parsestr(part)
|
||||||
container.preamble = preamble
|
container.preamble = preamble
|
||||||
container.epilogue = epilogue
|
container.epilogue = epilogue
|
||||||
container.attach(msgobj)
|
container.attach(msgobj)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue