mirror of
https://github.com/python/cpython.git
synced 2025-10-17 12:18:23 +00:00
Get a 3- to 4-fold speedup for sub()/subn(), split() and findall() by
not calling self.search(); instead, call self.code.match() directly and interpret the list of registers it returns directly. This saves the overhead of instantiating a MatchObject for each hit, basically inlining search() as well as group(). When a MatchObject is still needed, one is allocated and reused for the duration of the scan.
This commit is contained in:
parent
c364cf8228
commit
0e5ab17ad3
1 changed file with 71 additions and 38 deletions
109
Lib/re.py
109
Lib/re.py
|
@ -138,42 +138,56 @@ class RegexObject:
|
||||||
non-overlapping occurrences of the pattern in the source
|
non-overlapping occurrences of the pattern in the source
|
||||||
string by the replacement repl. number is the number of
|
string by the replacement repl. number is the number of
|
||||||
substitutions that were made."""
|
substitutions that were made."""
|
||||||
|
|
||||||
if count < 0:
|
if count < 0:
|
||||||
raise error, "negative substitution count"
|
raise error, "negative substitution count"
|
||||||
if count == 0:
|
if count == 0:
|
||||||
import sys
|
|
||||||
count = sys.maxint
|
count = sys.maxint
|
||||||
if type(repl) == type(''):
|
|
||||||
if '\\' in repl:
|
|
||||||
repl = lambda m, r=repl: pcre_expand(m, r)
|
|
||||||
else:
|
|
||||||
repl = lambda m, r=repl: r
|
|
||||||
n = 0 # Number of matches
|
n = 0 # Number of matches
|
||||||
pos = 0 # Where to start searching
|
pos = 0 # Where to start searching
|
||||||
lastmatch = -1 # End of last match
|
lastmatch = -1 # End of last match
|
||||||
results = [] # Substrings making up the result
|
results = [] # Substrings making up the result
|
||||||
end = len(source)
|
end = len(source)
|
||||||
|
|
||||||
|
if type(repl) is type(''):
|
||||||
|
# See if repl contains group references
|
||||||
|
try:
|
||||||
|
repl = pcre_expand(_Dummy, repl)
|
||||||
|
except:
|
||||||
|
m = MatchObject(self, source, 0, end, [])
|
||||||
|
repl = lambda m, repl=repl, expand=pcre_expand: expand(m, repl)
|
||||||
|
else:
|
||||||
|
m = None
|
||||||
|
else:
|
||||||
|
m = MatchObject(self, source, 0, end, [])
|
||||||
|
|
||||||
|
match = self.code.match
|
||||||
|
append = results.append
|
||||||
while n < count and pos <= end:
|
while n < count and pos <= end:
|
||||||
m = self.search(source, pos)
|
regs = match(source, pos, end, 0)
|
||||||
if not m:
|
if not regs:
|
||||||
break
|
break
|
||||||
i, j = m.span(0)
|
i, j = regs[0]
|
||||||
if i == j == lastmatch:
|
if i == j == lastmatch:
|
||||||
# Empty match adjacent to previous match
|
# Empty match adjacent to previous match
|
||||||
pos = pos + 1
|
pos = pos + 1
|
||||||
results.append(source[lastmatch:pos])
|
append(source[lastmatch:pos])
|
||||||
continue
|
continue
|
||||||
if pos < i:
|
if pos < i:
|
||||||
results.append(source[pos:i])
|
append(source[pos:i])
|
||||||
results.append(repl(m))
|
if m:
|
||||||
|
m.pos = pos
|
||||||
|
m.regs = regs
|
||||||
|
append(repl(m))
|
||||||
|
else:
|
||||||
|
append(repl)
|
||||||
pos = lastmatch = j
|
pos = lastmatch = j
|
||||||
if i == j:
|
if i == j:
|
||||||
# Last match was empty; don't try here again
|
# Last match was empty; don't try here again
|
||||||
pos = pos + 1
|
pos = pos + 1
|
||||||
results.append(source[lastmatch:pos])
|
append(source[lastmatch:pos])
|
||||||
n = n + 1
|
n = n + 1
|
||||||
results.append(source[pos:])
|
append(source[pos:])
|
||||||
return (string.join(results, ''), n)
|
return (string.join(results, ''), n)
|
||||||
|
|
||||||
def split(self, source, maxsplit=0):
|
def split(self, source, maxsplit=0):
|
||||||
|
@ -183,34 +197,40 @@ class RegexObject:
|
||||||
if maxsplit < 0:
|
if maxsplit < 0:
|
||||||
raise error, "negative split count"
|
raise error, "negative split count"
|
||||||
if maxsplit == 0:
|
if maxsplit == 0:
|
||||||
import sys
|
|
||||||
maxsplit = sys.maxint
|
maxsplit = sys.maxint
|
||||||
n = 0
|
n = 0
|
||||||
pos = 0
|
pos = 0
|
||||||
lastmatch = 0
|
lastmatch = 0
|
||||||
results = []
|
results = []
|
||||||
end = len(source)
|
end = len(source)
|
||||||
|
match = self.code.match
|
||||||
|
append = results.append
|
||||||
while n < maxsplit:
|
while n < maxsplit:
|
||||||
m = self.search(source, pos)
|
regs = match(source, pos, end, 0)
|
||||||
if not m:
|
if not regs:
|
||||||
break
|
break
|
||||||
i, j = m.span(0)
|
i, j = regs[0]
|
||||||
if i == j:
|
if i == j:
|
||||||
# Empty match
|
# Empty match
|
||||||
if pos >= end:
|
if pos >= end:
|
||||||
break
|
break
|
||||||
pos = pos+1
|
pos = pos+1
|
||||||
continue
|
continue
|
||||||
results.append(source[lastmatch:i])
|
append(source[lastmatch:i])
|
||||||
g = m.groups()
|
rest = regs[1:]
|
||||||
if g:
|
if rest:
|
||||||
results[len(results):] = list(g)
|
for a, b in rest:
|
||||||
|
if a == -1 or b == -1:
|
||||||
|
group = None
|
||||||
|
else:
|
||||||
|
group = source[a:b]
|
||||||
|
append(group)
|
||||||
pos = lastmatch = j
|
pos = lastmatch = j
|
||||||
n = n + 1
|
n = n + 1
|
||||||
results.append(source[lastmatch:])
|
append(source[lastmatch:])
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def findall(self, string):
|
def findall(self, source):
|
||||||
"""Return a list of all non-overlapping matches in the string.
|
"""Return a list of all non-overlapping matches in the string.
|
||||||
|
|
||||||
If one or more groups are present in the pattern, return a
|
If one or more groups are present in the pattern, return a
|
||||||
|
@ -221,20 +241,29 @@ class RegexObject:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
pos = 0
|
pos = 0
|
||||||
n = len(string)
|
end = len(source)
|
||||||
result = []
|
results = []
|
||||||
while pos <= n:
|
match = self.code.match
|
||||||
m = self.search(string, pos)
|
append = results.append
|
||||||
if not m:
|
while pos <= end:
|
||||||
|
regs = match(source, pos, end, 0)
|
||||||
|
if not regs:
|
||||||
break
|
break
|
||||||
gr = m.groups()
|
i, j = regs[0]
|
||||||
if not gr:
|
rest = regs[1:]
|
||||||
gr = m.group()
|
if not rest:
|
||||||
elif len(gr) == 1:
|
gr = source[i:j]
|
||||||
gr = gr[0]
|
elif len(rest) == 1:
|
||||||
result.append(gr)
|
a, b = rest[0]
|
||||||
pos = max(m.end(), pos+1)
|
gr = source[a:b]
|
||||||
return result
|
else:
|
||||||
|
gr = []
|
||||||
|
for (a, b) in rest:
|
||||||
|
gr.append(source[a:b])
|
||||||
|
gr = tuple(gr)
|
||||||
|
append(gr)
|
||||||
|
pos = max(j, pos+1)
|
||||||
|
return results
|
||||||
|
|
||||||
# The following 3 functions were contributed by Mike Fletcher, and
|
# The following 3 functions were contributed by Mike Fletcher, and
|
||||||
# allow pickling and unpickling of RegexObject instances.
|
# allow pickling and unpickling of RegexObject instances.
|
||||||
|
@ -251,6 +280,10 @@ class RegexObject:
|
||||||
self.groupindex = statetuple[2]
|
self.groupindex = statetuple[2]
|
||||||
self.code = apply(pcre_compile, statetuple)
|
self.code = apply(pcre_compile, statetuple)
|
||||||
|
|
||||||
|
class _Dummy:
|
||||||
|
# Dummy class used by _subn_string(). Has 'group' to avoid core dump.
|
||||||
|
group = None
|
||||||
|
|
||||||
class MatchObject:
|
class MatchObject:
|
||||||
|
|
||||||
def __init__(self, re, string, pos, endpos, regs):
|
def __init__(self, re, string, pos, endpos, regs):
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue