Get a 3- to 4-fold speedup for sub()/subn(), split() and findall() by
not calling self.search(); instead, call self.code.match() and
interpret the list of registers it returns directly.  This saves
the overhead of instantiating a MatchObject for each hit, basically
inlining search() as well as group().  When a MatchObject is still
needed, one is allocated and reused for the duration of the scan.
This commit is contained in:
Guido van Rossum 1998-07-17 20:18:49 +00:00
parent c364cf8228
commit 0e5ab17ad3

109
Lib/re.py
View file

@ -138,42 +138,56 @@ class RegexObject:
non-overlapping occurrences of the pattern in the source non-overlapping occurrences of the pattern in the source
string by the replacement repl. number is the number of string by the replacement repl. number is the number of
substitutions that were made.""" substitutions that were made."""
if count < 0: if count < 0:
raise error, "negative substitution count" raise error, "negative substitution count"
if count == 0: if count == 0:
import sys
count = sys.maxint count = sys.maxint
if type(repl) == type(''):
if '\\' in repl:
repl = lambda m, r=repl: pcre_expand(m, r)
else:
repl = lambda m, r=repl: r
n = 0 # Number of matches n = 0 # Number of matches
pos = 0 # Where to start searching pos = 0 # Where to start searching
lastmatch = -1 # End of last match lastmatch = -1 # End of last match
results = [] # Substrings making up the result results = [] # Substrings making up the result
end = len(source) end = len(source)
if type(repl) is type(''):
# See if repl contains group references
try:
repl = pcre_expand(_Dummy, repl)
except:
m = MatchObject(self, source, 0, end, [])
repl = lambda m, repl=repl, expand=pcre_expand: expand(m, repl)
else:
m = None
else:
m = MatchObject(self, source, 0, end, [])
match = self.code.match
append = results.append
while n < count and pos <= end: while n < count and pos <= end:
m = self.search(source, pos) regs = match(source, pos, end, 0)
if not m: if not regs:
break break
i, j = m.span(0) i, j = regs[0]
if i == j == lastmatch: if i == j == lastmatch:
# Empty match adjacent to previous match # Empty match adjacent to previous match
pos = pos + 1 pos = pos + 1
results.append(source[lastmatch:pos]) append(source[lastmatch:pos])
continue continue
if pos < i: if pos < i:
results.append(source[pos:i]) append(source[pos:i])
results.append(repl(m)) if m:
m.pos = pos
m.regs = regs
append(repl(m))
else:
append(repl)
pos = lastmatch = j pos = lastmatch = j
if i == j: if i == j:
# Last match was empty; don't try here again # Last match was empty; don't try here again
pos = pos + 1 pos = pos + 1
results.append(source[lastmatch:pos]) append(source[lastmatch:pos])
n = n + 1 n = n + 1
results.append(source[pos:]) append(source[pos:])
return (string.join(results, ''), n) return (string.join(results, ''), n)
def split(self, source, maxsplit=0): def split(self, source, maxsplit=0):
@ -183,34 +197,40 @@ class RegexObject:
if maxsplit < 0: if maxsplit < 0:
raise error, "negative split count" raise error, "negative split count"
if maxsplit == 0: if maxsplit == 0:
import sys
maxsplit = sys.maxint maxsplit = sys.maxint
n = 0 n = 0
pos = 0 pos = 0
lastmatch = 0 lastmatch = 0
results = [] results = []
end = len(source) end = len(source)
match = self.code.match
append = results.append
while n < maxsplit: while n < maxsplit:
m = self.search(source, pos) regs = match(source, pos, end, 0)
if not m: if not regs:
break break
i, j = m.span(0) i, j = regs[0]
if i == j: if i == j:
# Empty match # Empty match
if pos >= end: if pos >= end:
break break
pos = pos+1 pos = pos+1
continue continue
results.append(source[lastmatch:i]) append(source[lastmatch:i])
g = m.groups() rest = regs[1:]
if g: if rest:
results[len(results):] = list(g) for a, b in rest:
if a == -1 or b == -1:
group = None
else:
group = source[a:b]
append(group)
pos = lastmatch = j pos = lastmatch = j
n = n + 1 n = n + 1
results.append(source[lastmatch:]) append(source[lastmatch:])
return results return results
def findall(self, string): def findall(self, source):
"""Return a list of all non-overlapping matches in the string. """Return a list of all non-overlapping matches in the string.
If one or more groups are present in the pattern, return a If one or more groups are present in the pattern, return a
@ -221,20 +241,29 @@ class RegexObject:
""" """
pos = 0 pos = 0
n = len(string) end = len(source)
result = [] results = []
while pos <= n: match = self.code.match
m = self.search(string, pos) append = results.append
if not m: while pos <= end:
regs = match(source, pos, end, 0)
if not regs:
break break
gr = m.groups() i, j = regs[0]
if not gr: rest = regs[1:]
gr = m.group() if not rest:
elif len(gr) == 1: gr = source[i:j]
gr = gr[0] elif len(rest) == 1:
result.append(gr) a, b = rest[0]
pos = max(m.end(), pos+1) gr = source[a:b]
return result else:
gr = []
for (a, b) in rest:
gr.append(source[a:b])
gr = tuple(gr)
append(gr)
pos = max(j, pos+1)
return results
# The following 3 functions were contributed by Mike Fletcher, and # The following 3 functions were contributed by Mike Fletcher, and
# allow pickling and unpickling of RegexObject instances. # allow pickling and unpickling of RegexObject instances.
@ -251,6 +280,10 @@ class RegexObject:
self.groupindex = statetuple[2] self.groupindex = statetuple[2]
self.code = apply(pcre_compile, statetuple) self.code = apply(pcre_compile, statetuple)
class _Dummy:
# Dummy class used by _subn_string(). Has 'group' to avoid core dump.
group = None
class MatchObject: class MatchObject:
def __init__(self, re, string, pos, endpos, regs): def __init__(self, re, string, pos, endpos, regs):