# Mirror of https://github.com/python/cpython.git
# (synced 2025-11-04 11:49:12 +00:00; 362 lines, 13 KiB, Python)

# A new Feed-style Parser

import re
from email import Errors, Message

# Matches one line terminator: CRLF, a bare CR, or a bare LF.  CRLF is listed
# first so that it is consumed as a single terminator rather than as CR + LF.
NLCRE = re.compile('\r\n|\r|\n')

# String constants used for joining pieces of text.
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel returned/yielded by the input buffer when it has no
# complete line available but the input has not been marked as finished.
# Compare with ``is``; it never equals any real line of text.
NeedMoreData = object()

class FeedableLumpOfText:
    """A file-like object that can have new data loaded into it.

    Text is supplied in arbitrary chunks with push(); complete lines (each
    keeping its own line terminator) are handed back by readline(),
    peekline(), readuntil() or by iterating over the object.  While no
    complete line is available and end() has not been called, the reading
    methods return the module-level ``NeedMoreData`` sentinel instead of a
    string.  After end(), an exhausted buffer reads as ''.
    """

    # Bound ``match`` of a pattern for a line terminator followed by the end
    # of the string.  Because ``match`` anchors at the start, it succeeds
    # only for a string that consists of nothing but a terminator, i.e. a
    # blank line.
    NLatend = re.compile('(\r\n|\r|\n)$').match
    # Splitting pattern.  The capturing group makes re.split() keep the
    # terminators in its result, alternating text / terminator / text ...
    NLCRE_crack = re.compile('(\r\n|\r|\n)')

    def __init__(self):
        # Text received after the last complete line (no terminator yet).
        self._partial = ''
        # Set by end(): no more data will be pushed.
        self._done = False
        # _pending is a list of lines, in reverse order (next line is last).
        self._pending = []

    def readline(self):
        """Return the next line of data and remove it from the buffer.

        If data has been pushed back with unreadline(), the most recently
        unreadline()d line is returned first.  Returns NeedMoreData when no
        line is buffered and end() has not been called, and '' when no line
        is buffered after end().
        """
        if not self._pending:
            if self._done:
                return ''
            return NeedMoreData
        return self._pending.pop()

    def unreadline(self, line):
        """Push a line back so that it is the next one to be read."""
        self._pending.append(line)

    def peekline(self):
        """Non-destructively look at the next line.

        Returns the same value readline() would, without consuming it.
        """
        if not self._pending:
            if self._done:
                return ''
            return NeedMoreData
        return self._pending[-1]

    # Typical use from a generator-based caller:
    #
    # for r in self._input.readuntil(regexp):
    #     if r is NeedMoreData:
    #         yield NeedMoreData
    #     else:
    #         preamble, matchobj = r
    #         break
    def readuntil(self, matchre, afterblank=False, includematch=False):
        """Read a line at a time until one matches the specified RE.

        This is a generator.  It yields NeedMoreData each time it runs out
        of buffered lines before end() has been called, and finally yields
        exactly one ``(text, matchobj)`` tuple and stops:

        * ``text`` is everything read up to the matching line (including
          that line if includematch is true);
        * ``matchobj`` is the result of ``matchre.match(line)`` for the
          matching line, or None if end-of-data was reached first, in which
          case ``text`` is everything that was left.

        If afterblank is true, a line only counts as a match when the line
        immediately before it was blank (a bare line terminator).  The
        matching line is consumed, so the next read returns the line
        following it.
        """
        prematch = []
        # True when the previously consumed line was blank.
        blankseen = False
        while 1:
            if not self._pending:
                if self._done:
                    # End of file: hand back what we have and stop.
                    yield ''.join(prematch), None
                    return
                yield NeedMoreData
                continue
            line = self._pending.pop()
            m = matchre.match(line)
            if m and (blankseen or not afterblank):
                if includematch:
                    prematch.append(line)
                yield ''.join(prematch), m
                # Stop here so a caller that keeps iterating neither
                # re-appends the matched line nor consumes further input.
                return
            # Remember blankness for the *next* line's afterblank test, and
            # keep every non-matching line (blank ones too) in the text.
            blankseen = bool(self.NLatend(line))
            prematch.append(line)

    def push(self, data):
        """Push some new data into this object.

        Complete lines are queued for reading; any trailing text without a
        terminator is held back until more data arrives or end() is called.
        """
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but leave the newlines on the end of each
        parts = self.NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied
        # groups means that the last element is the data after the
        # final RE. In the case of a NL/CR terminated string, this is
        # the empty string.
        self._partial = parts.pop()
        # A chunk ending in a bare CR may be the first half of a CRLF whose
        # LF arrives with the next chunk.  Hold that line back so the pair
        # is not split into two lines.
        if not self._partial and parts and parts[-1] == '\r':
            self._partial = parts.pop(-2) + parts.pop()
        # parts now alternates text, terminator, text, terminator, ...
        lines = [parts[i] + parts[i + 1] for i in range(0, len(parts), 2)]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Push a list of new lines into the object."""
        # Reverse and insert at the front of _pending, so that they are read
        # after everything already queued and in their original order.
        self._pending[:0] = lines[::-1]

    def end(self):
        """Signal that there is no more data.

        Any held-back text that never received a line terminator is queued
        as a final line so that it is not lost.
        """
        if self._partial:
            self.pushlines([self._partial])
            self._partial = ''
        self._done = True

    def is_done(self):
        """Return True once end() has been called."""
        return self._done

    def __iter__(self):
        return self

    def next(self):
        """Return the next line; stop iterating once readline() returns ''.

        NeedMoreData is passed through as an ordinary item, so callers must
        check for it.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line

    # Also support the newer iterator-protocol spelling.
    __next__ = next

class FeedParser:
    """A feed-style (incremental) parser of email text.

    Text is supplied piecemeal with feed(); end() marks the end of the input
    and returns the root message object.  Message objects are created by
    calling the ``_class`` factory given to the constructor, and sub-objects
    are attached to their container with ``attach()``.
    """

    def __init__(self, _class=Message.Message):
        """Create a parser.

        _class is called with no arguments to create each message object;
        it defaults to Message.Message.
        """
        self._class = _class
        self._input = FeedableLumpOfText()
        self._root = None
        # Stack of containers currently being filled; the top is self._cur.
        self._objectstack = []
        # The parser proper is a generator; each call to _parse() resumes it
        # until it next runs out of input.
        self._parse = self._parsegen().next

    def end(self):
        """Signal end of input, finish parsing and return the root object."""
        self._input.end()
        self._call_parse()
        return self._root

    def feed(self, data):
        """Push more text into the parser and parse as far as possible."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Resume the parsing generator once."""
        try:
            self._parse()
        except StopIteration:
            # The generator has run to completion; nothing more to do.
            pass

    # A header-section line: a Unix-From line, a "Name:" field start, or a
    # continuation line beginning with whitespace.
    headerRE = re.compile(r'^(From |[-\w]{2,}:|[\t ])')

    def _parse_headers(self, headerlist):
        """Store the headers in headerlist on the current object.

        headerlist is a list of strings that are the header lines for the
        current object (self._cur).  Continuation lines are joined to the
        preceding header.  Raises Errors.HeaderParseError if the first line
        is a continuation or if a line is neither a header nor a
        continuation.
        """
        lastheader = ''
        lastvalue = []

        for lineno, line in enumerate(headerlist):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError('First line must not be a continuation')
                lastvalue.append(line)
                continue

            if lastheader:
                # XXX reconsider the joining of folded lines
                self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()
                lastheader, lastvalue = '', []

            # Check for Unix-From
            if line.startswith('From '):
                if lineno == 0:
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(headerlist) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line. Ignore it.
                    continue

            i = line.find(':')
            if i < 0:
                # The older parser had various special-cases here. We've
                # already handled them
                raise Errors.HeaderParseError(
                       "Not a header, not a continuation: ``%s''" % line)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]

        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()

    def _parsegen(self):
        """Generator that parses whatever text is currently available.

        It yields whenever it needs more input (the yielded value is ignored
        by _call_parse) and finishes once the input is exhausted after
        end().  State: ``completing`` is true while text following a part is
        being collected rather than headers of a new part; ``last`` is the
        object most recently popped off the container stack.
        """
        # Parse any currently available text
        self._new_sub_object()
        self._root = self._cur
        completing = False
        last = None

        for line in self._input:
            if line is NeedMoreData:
                yield None # Need More Data
                continue
            # We only wanted to know that a line exists; put it back.
            self._input.unreadline(line)
            if not completing:
                headers = []
                # Now collect all headers.
                for line in self._input:
                    if line is NeedMoreData:
                        yield None # Need More Data
                        continue
                    if not self.headerRE.match(line):
                        self._parse_headers(headers)
                        # A message/rfc822 has no body and no internal
                        # boundary.
                        if self._cur.get_content_maintype() == "message":
                            self._new_sub_object()
                            completing = False
                            headers = []
                            continue
                        if line.strip():
                            # No blank line between headers and body.
                            # Push this line back, it's the first line of
                            # the body.
                            self._input.unreadline(line)
                        break
                    else:
                        headers.append(line)
                else:
                    # We're done with the data and are still inside the headers
                    self._parse_headers(headers)

            # Now we're dealing with the body
            boundary = self._cur.get_boundary()
            isdigest = (self._cur.get_content_type() == 'multipart/digest')
            if boundary and not self._cur._finishing:
                separator = '--' + boundary
                self._cur._boundaryRE = re.compile(
                        r'(?P<sep>' + re.escape(separator) +
                        r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
                for r in self._input.readuntil(self._cur._boundaryRE):
                    if r is NeedMoreData:
                        yield NeedMoreData
                    else:
                        preamble, matchobj = r
                        break
                if not matchobj:
                    # Broken - we hit the end of file. Just set the body
                    # to the text.
                    if completing:
                        self._attach_trailer(last, preamble)
                    else:
                        self._attach_preamble(self._cur, preamble)
                    # XXX move back to the parent container.
                    self._pop_container()
                    completing = True
                    continue
                if preamble:
                    if completing:
                        # Drop as many trailing characters as the boundary
                        # line's own terminator is long.
                        preamble = preamble[:-len(matchobj.group('linesep'))]
                        self._attach_trailer(last, preamble)
                    else:
                        self._attach_preamble(self._cur, preamble)
                elif not completing:
                    # The module docs specify an empty preamble is None, not ''
                    self._cur.preamble = None
                    # If we _are_ completing, the last object gets no payload

                if matchobj.group('end'):
                    # That was the end boundary tag. Bounce back to the
                    # parent container
                    last = self._pop_container()
                    self._input.unreadline(matchobj.group('linesep'))
                    completing = True
                    continue

                # A number of MTAs produced by a nameless large company
                # we shall call "SicroMoft" produce repeated boundary
                # lines.
                while True:
                    line = self._input.peekline()
                    if line is NeedMoreData:
                        yield None
                        continue
                    if self._cur._boundaryRE.match(line):
                        self._input.readline()
                    else:
                        break

                self._new_sub_object()

                completing = False
                if isdigest:
                    self._cur.set_default_type('message/rfc822')
                    continue
            else:
                # non-multipart or after end-boundary
                if last is not self._root:
                    last = self._pop_container()
                if self._cur.get_content_maintype() == "message":
                    # We double-pop to leave the RFC822 object
                    self._pop_container()
                    completing = True
                elif self._cur._boundaryRE and last is not self._root:
                    # Identity test, matching the check on ``last`` above.
                    completing = True
                else:
                    # Non-multipart top level, or in the trailer of the
                    # top level multipart
                    while not self._input.is_done():
                        yield None
                    data = list(self._input)
                    body = EMPTYSTRING.join(data)
                    self._attach_trailer(last, body)

    def _attach_trailer(self, obj, trailer):
        """Attach text that follows obj's content.

        For an object whose main content type is "multipart" or "message"
        the text becomes its epilogue; otherwise it becomes the payload.
        """
        if obj.get_content_maintype() in ( "multipart", "message" ):
            obj.epilogue = trailer
        else:
            obj.set_payload(trailer)

    def _attach_preamble(self, obj, trailer):
        """Attach text that precedes obj's first boundary.

        For an object whose main content type is "multipart" or "message"
        the text (passed in the parameter named ``trailer``) becomes its
        preamble; otherwise it becomes the payload.
        """
        if obj.get_content_maintype() in ( "multipart", "message" ):
            obj.preamble = trailer
        else:
            obj.set_payload(trailer)

    def _new_sub_object(self):
        """Create a new message object and make it the current one.

        The new object is attached to the container on top of the stack (if
        there is one) and then pushed onto the stack itself.
        """
        new = self._class()
        if self._objectstack:
            self._objectstack[-1].attach(new)
        self._objectstack.append(new)
        # Parser bookkeeping stored on the message object itself.
        new._boundaryRE = None
        new._finishing = False
        self._cur = new

    def _pop_container(self):
        """Move the pointer to the container of the current object.

        Returns the (old) current object.  When the stack becomes empty the
        current object is left in place and flagged as finishing.
        """
        last = self._objectstack.pop()
        if self._objectstack:
            self._cur = self._objectstack[-1]
        else:
            self._cur._finishing = True
        return last
