parse(), _parseheaders(), _parsebody(): A fix for SF bug #633527,

where in lax parsing, the first non-header line after a header block (e.g. the first line not containing a colon, and not a continuation), can be treated as the first body line, even without the RFC mandated blank line separator. rfc822 had this behavior, and I vaguely remember problems with this, but can't remember details. In any event, all the tests still pass, so I guess we'll find out. ;/ This patch works by returning the non-header, non-continuation line from _parseheader() and using that as the first header line prepended to fp.read() if given. It's usually None. We use this approach instead of trying to seek/tell the file-like object.
2025-11-01 18:51:43 +00:00 · 2002-11-05 21:44:06 +00:00 · 2002-11-05 21:44:06 +00:00 · da2525ed2a
commit da2525ed2a
parent a0a00761a5
1 changed files with 22 additions and 9 deletions
--- a/Lib/email/Parser.py
+++ b/Lib/email/Parser.py
@ -59,9 +59,9 @@ class Parser:
        meaning it parses the entire contents of the file.
        """
        root = self._class()
-        self._parseheaders(root, fp)
+        firstbodyline = self._parseheaders(root, fp)
        if not headersonly:
-            self._parsebody(root, fp)
+            self._parsebody(root, fp, firstbodyline)
        return root
    def parsestr(self, text, headersonly=False):
@ -80,6 +80,7 @@ class Parser:
        lastheader = ''
        lastvalue = []
        lineno = 0
        firstbodyline = None
        while True:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
@ -120,13 +121,16 @@ class Parser:
            if i < 0:
                if self._strict:
                    raise Errors.HeaderParseError(
-                        "Not a header, not a continuation: ``%s''"%line)
+                        "Not a header, not a continuation: ``%s''" % line)
                elif lineno == 1 and line.startswith('--'):
                    # allow through duplicate boundary tags.
                    continue
                else:
-                    raise Errors.HeaderParseError(
+                    # There was no separating blank line as mandated by RFC
-                        "Not a header, not a continuation: ``%s''"%line)
+                    # 2822, but we're in non-strict mode.  So just offer up
                    # this current line as the first body line.
                    firstbodyline = line
                    break
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
@ -134,8 +138,9 @@ class Parser:
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)
        return firstbodyline
-    def _parsebody(self, container, fp):
+    def _parsebody(self, container, fp, firstbodyline=None):
        # Parse the body, but first split the payload on the content-type
        # boundary if present.
        boundary = container.get_boundary()
@ -152,6 +157,8 @@ class Parser:
            # boundary.
            separator = '--' + boundary
            payload = fp.read()
            if firstbodyline is not None:
                payload = firstbodyline + '\n' + payload
            # We use an RE here because boundaries can have trailing
            # whitespace.
            mo = re.search(
@ -260,7 +267,10 @@ class Parser:
                self._parsebody(msg, fp)
            container.attach(msg)
        else:
-            container.set_payload(fp.read())
+            text = fp.read()
            if firstbodyline is not None:
                text = firstbodyline + '\n' + text
            container.set_payload(text)
@ -274,6 +284,9 @@ class HeaderParser(Parser):
    Parsing with this subclass can be considerably faster if all you're
    interested in is the message headers.
    """
-    def _parsebody(self, container, fp):
+    def _parsebody(self, container, fp, firstbodyline=None):
        # Consume but do not parse, the body
-        container.set_payload(fp.read())
+        text = fp.read()
        if firstbodyline is not None:
            text = firstbodyline + '\n' + text
        container.set_payload(text)