mirror of
https://github.com/python/cpython.git
synced 2025-08-04 17:08:35 +00:00
#15220: simplify and speed up feedparser's line splitting.
Original patch submitted by QNX, modified for clarity by me (mostly comments). QNX reports a 30% speedup in average email parsing time.
This commit is contained in:
parent
f0bf84c84b
commit
2940e71add
2 changed files with 12 additions and 18 deletions
|
@@ -98,24 +98,15 @@ class BufferedSubFile(object):
|
|||
"""Push some new data into this object."""
|
||||
# Handle any previous leftovers
|
||||
data, self._partial = self._partial + data, ''
|
||||
# Crack into lines, but preserve the newlines on the end of each
|
||||
parts = NLCRE_crack.split(data)
|
||||
# The *ahem* interesting behaviour of re.split when supplied grouping
|
||||
# parentheses is that the last element of the resulting list is the
|
||||
# data after the final RE. In the case of a NL/CR terminated string,
|
||||
# this is the empty string.
|
||||
self._partial = parts.pop()
|
||||
#GAN 29Mar09 bugs 1555570, 1721862 Confusion at 8K boundary ending with \r:
|
||||
# is there a \n to follow later?
|
||||
if not self._partial and parts and parts[-1].endswith('\r'):
|
||||
self._partial = parts.pop(-2)+parts.pop()
|
||||
# parts is a list of strings, alternating between the line contents
|
||||
# and the eol character(s). Gather up a list of lines after
|
||||
# re-attaching the newlines.
|
||||
lines = []
|
||||
for i in range(len(parts) // 2):
|
||||
lines.append(parts[i*2] + parts[i*2+1])
|
||||
self.pushlines(lines)
|
||||
# Crack into lines, but preserve the linesep characters on the end of each
|
||||
parts = data.splitlines(True)
|
||||
# If the last element of the list does not end in a newline, then treat
|
||||
# it as a partial line. We only check for '\n' here because a line
|
||||
# ending with '\r' might be a line that was split in the middle of a
|
||||
# '\r\n' sequence (see bugs 1555570 and 1721862).
|
||||
if parts and not parts[-1].endswith('\n'):
|
||||
self._partial = parts.pop()
|
||||
self.pushlines(parts)
|
||||
|
||||
def pushlines(self, lines):
|
||||
# Reverse and insert at the front of the lines.
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue