Get a 3- to 4-fold speedup for sub()/subn(), split() and findall() by
not calling self.search(); instead, call self.code.match() directly
and interpret the list of registers it returns.  This saves
the overhead of instantiating a MatchObject for each hit, effectively
inlining search() as well as group().  When a MatchObject is still
needed, a single one is allocated and reused for the duration of the scan.
This commit is contained in:
Guido van Rossum 1998-07-17 20:18:49 +00:00
parent c364cf8228
commit 0e5ab17ad3

107
Lib/re.py
View file

@ -142,38 +142,52 @@ class RegexObject:
if count < 0: if count < 0:
raise error, "negative substitution count" raise error, "negative substitution count"
if count == 0: if count == 0:
import sys
count = sys.maxint count = sys.maxint
if type(repl) == type(''):
if '\\' in repl:
repl = lambda m, r=repl: pcre_expand(m, r)
else:
repl = lambda m, r=repl: r
n = 0 # Number of matches n = 0 # Number of matches
pos = 0 # Where to start searching pos = 0 # Where to start searching
lastmatch = -1 # End of last match lastmatch = -1 # End of last match
results = [] # Substrings making up the result results = [] # Substrings making up the result
end = len(source) end = len(source)
if type(repl) is type(''):
# See if repl contains group references
try:
repl = pcre_expand(_Dummy, repl)
except:
m = MatchObject(self, source, 0, end, [])
repl = lambda m, repl=repl, expand=pcre_expand: expand(m, repl)
else:
m = None
else:
m = MatchObject(self, source, 0, end, [])
match = self.code.match
append = results.append
while n < count and pos <= end: while n < count and pos <= end:
m = self.search(source, pos) regs = match(source, pos, end, 0)
if not m: if not regs:
break break
i, j = m.span(0) i, j = regs[0]
if i == j == lastmatch: if i == j == lastmatch:
# Empty match adjacent to previous match # Empty match adjacent to previous match
pos = pos + 1 pos = pos + 1
results.append(source[lastmatch:pos]) append(source[lastmatch:pos])
continue continue
if pos < i: if pos < i:
results.append(source[pos:i]) append(source[pos:i])
results.append(repl(m)) if m:
m.pos = pos
m.regs = regs
append(repl(m))
else:
append(repl)
pos = lastmatch = j pos = lastmatch = j
if i == j: if i == j:
# Last match was empty; don't try here again # Last match was empty; don't try here again
pos = pos + 1 pos = pos + 1
results.append(source[lastmatch:pos]) append(source[lastmatch:pos])
n = n + 1 n = n + 1
results.append(source[pos:]) append(source[pos:])
return (string.join(results, ''), n) return (string.join(results, ''), n)
def split(self, source, maxsplit=0): def split(self, source, maxsplit=0):
@ -183,34 +197,40 @@ class RegexObject:
if maxsplit < 0: if maxsplit < 0:
raise error, "negative split count" raise error, "negative split count"
if maxsplit == 0: if maxsplit == 0:
import sys
maxsplit = sys.maxint maxsplit = sys.maxint
n = 0 n = 0
pos = 0 pos = 0
lastmatch = 0 lastmatch = 0
results = [] results = []
end = len(source) end = len(source)
match = self.code.match
append = results.append
while n < maxsplit: while n < maxsplit:
m = self.search(source, pos) regs = match(source, pos, end, 0)
if not m: if not regs:
break break
i, j = m.span(0) i, j = regs[0]
if i == j: if i == j:
# Empty match # Empty match
if pos >= end: if pos >= end:
break break
pos = pos+1 pos = pos+1
continue continue
results.append(source[lastmatch:i]) append(source[lastmatch:i])
g = m.groups() rest = regs[1:]
if g: if rest:
results[len(results):] = list(g) for a, b in rest:
if a == -1 or b == -1:
group = None
else:
group = source[a:b]
append(group)
pos = lastmatch = j pos = lastmatch = j
n = n + 1 n = n + 1
results.append(source[lastmatch:]) append(source[lastmatch:])
return results return results
def findall(self, string): def findall(self, source):
"""Return a list of all non-overlapping matches in the string. """Return a list of all non-overlapping matches in the string.
If one or more groups are present in the pattern, return a If one or more groups are present in the pattern, return a
@ -221,20 +241,29 @@ class RegexObject:
""" """
pos = 0 pos = 0
n = len(string) end = len(source)
result = [] results = []
while pos <= n: match = self.code.match
m = self.search(string, pos) append = results.append
if not m: while pos <= end:
regs = match(source, pos, end, 0)
if not regs:
break break
gr = m.groups() i, j = regs[0]
if not gr: rest = regs[1:]
gr = m.group() if not rest:
elif len(gr) == 1: gr = source[i:j]
gr = gr[0] elif len(rest) == 1:
result.append(gr) a, b = rest[0]
pos = max(m.end(), pos+1) gr = source[a:b]
return result else:
gr = []
for (a, b) in rest:
gr.append(source[a:b])
gr = tuple(gr)
append(gr)
pos = max(j, pos+1)
return results
# The following 3 functions were contributed by Mike Fletcher, and # The following 3 functions were contributed by Mike Fletcher, and
# allow pickling and unpickling of RegexObject instances. # allow pickling and unpickling of RegexObject instances.
@ -251,6 +280,10 @@ class RegexObject:
self.groupindex = statetuple[2] self.groupindex = statetuple[2]
self.code = apply(pcre_compile, statetuple) self.code = apply(pcre_compile, statetuple)
class _Dummy:
# Dummy class used by _subn_string(). Has 'group' to avoid core dump.
# In the code visible in this change, the substitution loop passes this
# class to pcre_expand() in place of a match object, to probe whether the
# replacement string contains group references.
# NOTE(review): no _subn_string() is visible here -- the name in the first
# comment line may be stale; confirm against the rest of the file.
group = None
class MatchObject: class MatchObject:
def __init__(self, re, string, pos, endpos, regs): def __init__(self, re, string, pos, endpos, regs):