parse(), _parseheaders(), _parsebody(): A fix for SF bug #633527,

where in lax parsing, the first non-header line after a header block
(i.e. the first line not containing a colon, and not a continuation),
can be treated as the first body line, even without the RFC mandated
blank line separator.

rfc822 had this behavior, and I vaguely remember problems with this,
but can't remember details.  In any event, all the tests still pass,
so I guess we'll find out. ;/

This patch works by returning the non-header, non-continuation line
from _parseheaders() and using that as the first body line, prepended
to fp.read() if given.  The returned value is usually None.

We use this approach instead of trying to seek/tell the file-like
object.
This commit is contained in:
Barry Warsaw 2002-11-05 21:44:06 +00:00
parent a0a00761a5
commit da2525ed2a

View file

@ -59,9 +59,9 @@ class Parser:
meaning it parses the entire contents of the file. meaning it parses the entire contents of the file.
""" """
root = self._class() root = self._class()
self._parseheaders(root, fp) firstbodyline = self._parseheaders(root, fp)
if not headersonly: if not headersonly:
self._parsebody(root, fp) self._parsebody(root, fp, firstbodyline)
return root return root
def parsestr(self, text, headersonly=False): def parsestr(self, text, headersonly=False):
@ -80,6 +80,7 @@ class Parser:
lastheader = '' lastheader = ''
lastvalue = [] lastvalue = []
lineno = 0 lineno = 0
firstbodyline = None
while True: while True:
# Don't strip the line before we test for the end condition, # Don't strip the line before we test for the end condition,
# because whitespace-only header lines are RFC compliant # because whitespace-only header lines are RFC compliant
@ -120,13 +121,16 @@ class Parser:
if i < 0: if i < 0:
if self._strict: if self._strict:
raise Errors.HeaderParseError( raise Errors.HeaderParseError(
"Not a header, not a continuation: ``%s''"%line) "Not a header, not a continuation: ``%s''" % line)
elif lineno == 1 and line.startswith('--'): elif lineno == 1 and line.startswith('--'):
# allow through duplicate boundary tags. # allow through duplicate boundary tags.
continue continue
else: else:
raise Errors.HeaderParseError( # There was no separating blank line as mandated by RFC
"Not a header, not a continuation: ``%s''"%line) # 2822, but we're in non-strict mode. So just offer up
# this current line as the first body line.
firstbodyline = line
break
if lastheader: if lastheader:
container[lastheader] = NL.join(lastvalue) container[lastheader] = NL.join(lastvalue)
lastheader = line[:i] lastheader = line[:i]
@ -134,8 +138,9 @@ class Parser:
# Make sure we retain the last header # Make sure we retain the last header
if lastheader: if lastheader:
container[lastheader] = NL.join(lastvalue) container[lastheader] = NL.join(lastvalue)
return firstbodyline
def _parsebody(self, container, fp): def _parsebody(self, container, fp, firstbodyline=None):
# Parse the body, but first split the payload on the content-type # Parse the body, but first split the payload on the content-type
# boundary if present. # boundary if present.
boundary = container.get_boundary() boundary = container.get_boundary()
@ -152,6 +157,8 @@ class Parser:
# boundary. # boundary.
separator = '--' + boundary separator = '--' + boundary
payload = fp.read() payload = fp.read()
if firstbodyline is not None:
payload = firstbodyline + '\n' + payload
# We use an RE here because boundaries can have trailing # We use an RE here because boundaries can have trailing
# whitespace. # whitespace.
mo = re.search( mo = re.search(
@ -260,7 +267,10 @@ class Parser:
self._parsebody(msg, fp) self._parsebody(msg, fp)
container.attach(msg) container.attach(msg)
else: else:
container.set_payload(fp.read()) text = fp.read()
if firstbodyline is not None:
text = firstbodyline + '\n' + text
container.set_payload(text)
@ -274,6 +284,9 @@ class HeaderParser(Parser):
Parsing with this subclass can be considerably faster if all you're Parsing with this subclass can be considerably faster if all you're
interested in is the message headers. interested in is the message headers.
""" """
def _parsebody(self, container, fp): def _parsebody(self, container, fp, firstbodyline=None):
# Consume but do not parse, the body # Consume but do not parse, the body
container.set_payload(fp.read()) text = fp.read()
if firstbodyline is not None:
text = firstbodyline + '\n' + text
container.set_payload(text)