Get a 3- to 4-fold speedup for sub()/subn(), split() and findall() by not calling self.search().

Instead, call self.code.match() directly and interpret the list of registers it returns. This saves the overhead of instantiating a MatchObject for each hit, in effect inlining search() as well as group(). When a MatchObject is still needed, one is allocated and reused for the duration of the scan.
2025-10-17 12:18:23 +00:00 · 1998-07-17 20:18:49 +00:00 · 1998-07-17 20:18:49 +00:00 · 0e5ab17ad3
commit 0e5ab17ad3
parent c364cf8228
1 changed files with 71 additions and 38 deletions
--- a/Lib/re.py
+++ b/Lib/re.py
@ -138,42 +138,56 @@ class RegexObject:
        non-overlapping occurrences of the pattern in the source
        string by the replacement repl.  number is the number of
        substitutions that were made."""
-
+        
        if count < 0:
            raise error, "negative substitution count"
        if count == 0:
            import sys
            count = sys.maxint
        if type(repl) == type(''):
            if '\\' in repl:
                repl = lambda m, r=repl: pcre_expand(m, r)
            else:
                repl = lambda m, r=repl: r
        n = 0           # Number of matches
        pos = 0         # Where to start searching
        lastmatch = -1  # End of last match
        results = []    # Substrings making up the result
        end = len(source)
        if type(repl) is type(''):
            # See if repl contains group references
            try:
                repl = pcre_expand(_Dummy, repl)
            except:
                m = MatchObject(self, source, 0, end, [])
                repl = lambda m, repl=repl, expand=pcre_expand: expand(m, repl)
            else:
                m = None
        else:
            m = MatchObject(self, source, 0, end, [])
        match = self.code.match
        append = results.append
        while n < count and pos <= end:
-            m = self.search(source, pos)
+            regs = match(source, pos, end, 0)
-            if not m:
+            if not regs:
                break
-            i, j = m.span(0)
+            i, j = regs[0]
            if i == j == lastmatch:
                # Empty match adjacent to previous match
                pos = pos + 1
-                results.append(source[lastmatch:pos])
+                append(source[lastmatch:pos])
                continue
            if pos < i:
-                results.append(source[pos:i])
+                append(source[pos:i])
-            results.append(repl(m))
+            if m:
                m.pos = pos
                m.regs = regs
                append(repl(m))
            else:
                append(repl)
            pos = lastmatch = j
            if i == j:
                # Last match was empty; don't try here again
                pos = pos + 1
-                results.append(source[lastmatch:pos])
+                append(source[lastmatch:pos])
            n = n + 1
-        results.append(source[pos:])
+        append(source[pos:])
        return (string.join(results, ''), n)
    def split(self, source, maxsplit=0):
@ -183,34 +197,40 @@ class RegexObject:
        if maxsplit < 0:
            raise error, "negative split count"
        if maxsplit == 0:
            import sys
            maxsplit = sys.maxint
        n = 0
        pos = 0
        lastmatch = 0
        results = []
        end = len(source)
        match = self.code.match
        append = results.append
        while n < maxsplit:
-            m = self.search(source, pos)
+            regs = match(source, pos, end, 0)
-            if not m:
+            if not regs:
                break
-            i, j = m.span(0)
+            i, j = regs[0]
            if i == j:
                # Empty match
                if pos >= end:
                    break
                pos = pos+1
                continue
-            results.append(source[lastmatch:i])
+            append(source[lastmatch:i])
-            g = m.groups()
+            rest = regs[1:]
-            if g:
+            if rest:
-                results[len(results):] = list(g)
+                for a, b in rest:
                    if a == -1 or b == -1:
                        group = None
                    else:
                        group = source[a:b]
                    append(group)
            pos = lastmatch = j
            n = n + 1
-        results.append(source[lastmatch:])
+        append(source[lastmatch:])
        return results
-    def findall(self, string):
+    def findall(self, source):
        """Return a list of all non-overlapping matches in the string.
        If one or more groups are present in the pattern, return a
@ -221,20 +241,29 @@ class RegexObject:
        """
        pos = 0
-        n = len(string)
+        end = len(source)
-        result = []
+        results = []
-        while pos <= n:
+        match = self.code.match
-            m = self.search(string, pos)
+        append = results.append
-            if not m:
+        while pos <= end:
            regs = match(source, pos, end, 0)
            if not regs:
                break
-            gr = m.groups()
+            i, j = regs[0]
-            if not gr:
+            rest = regs[1:]
-                gr = m.group()
+            if not rest:
-            elif len(gr) == 1:
+                gr = source[i:j]
-                gr = gr[0]
+            elif len(rest) == 1:
-            result.append(gr)
+                a, b = rest[0]
-            pos = max(m.end(), pos+1)
+                gr = source[a:b]
-        return result
+            else:
                gr = []
                for (a, b) in rest:
                    gr.append(source[a:b])
                gr = tuple(gr)
            append(gr)
            pos = max(j, pos+1)
        return results
    # The following 3 functions were contributed by Mike Fletcher, and
    # allow pickling and unpickling of RegexObject instances.
@ -251,6 +280,10 @@ class RegexObject:
        self.groupindex = statetuple[2]
        self.code = apply(pcre_compile, statetuple)
 class _Dummy:
    # Dummy stand-in for a match object: subn() passes it to pcre_expand()
    # to see if repl contains group references.  Has 'group' to avoid core dump.
    group = None
 class MatchObject:
    def __init__(self, re, string, pos, endpos, regs):