gh-91524: Speed up regular expression substitution (#91525)

The functions re.sub() and re.subn(), and the corresponding re.Pattern methods,
are now 2-3 times faster for replacement strings containing group references.

Closes #91524

Primarily authored by serhiy-storchaka Serhiy Storchaka
Minor-cleanups-by: Gregory P. Smith [Google] <greg@krypto.org>
This commit is contained in:
Serhiy Storchaka 2022-10-24 01:57:30 +03:00 committed by GitHub
parent 176b6c57be
commit 75a6fadf36
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 358 additions and 91 deletions

View file

@ -124,6 +124,7 @@ This module also defines an exception 'error'.
import enum
from . import _compiler, _parser
import functools
import _sre
# public symbols
@ -230,7 +231,7 @@ def purge():
"Clear the regular expression caches"
_cache.clear()
_cache2.clear()
_compile_repl.cache_clear()
_compile_template.cache_clear()
def template(pattern, flags=0):
"Compile a template pattern, returning a Pattern object, deprecated"
@ -328,24 +329,9 @@ def _compile(pattern, flags):
return p
@functools.lru_cache(_MAXCACHE)
def _compile_repl(repl, pattern):
def _compile_template(pattern, repl):
# internal: compile replacement pattern
return _parser.parse_template(repl, pattern)
def _expand(pattern, match, template):
# internal: Match.expand implementation hook
template = _parser.parse_template(template, pattern)
return _parser.expand_template(template, match)
def _subx(pattern, template):
# internal: Pattern.sub/subn implementation helper
template = _compile_repl(template, pattern)
if not template[0] and len(template[1]) == 1:
# literal replacement
return template[1][0]
def filter(match, template=template):
return _parser.expand_template(template, match)
return filter
return _sre.template(pattern, _parser.parse_template(repl, pattern))
# register myself for pickling

View file

@ -13,7 +13,7 @@
# update when constants are added or removed
MAGIC = 20220615
MAGIC = 20221023
from _sre import MAXREPEAT, MAXGROUPS

View file

@ -984,24 +984,28 @@ def parse(str, flags=0, state=None):
return p
def parse_template(source, state):
def parse_template(source, pattern):
# parse 're' replacement string into list of literals and
# group references
s = Tokenizer(source)
sget = s.get
groups = []
literals = []
result = []
literal = []
lappend = literal.append
def addliteral():
if s.istext:
result.append(''.join(literal))
else:
# The tokenizer implicitly decodes bytes objects as latin-1, we must
# therefore re-encode the final representation.
result.append(''.join(literal).encode('latin-1'))
del literal[:]
def addgroup(index, pos):
if index > state.groups:
if index > pattern.groups:
raise s.error("invalid group reference %d" % index, pos)
if literal:
literals.append(''.join(literal))
del literal[:]
groups.append((len(literals), index))
literals.append(None)
groupindex = state.groupindex
addliteral()
result.append(index)
groupindex = pattern.groupindex
while True:
this = sget()
if this is None:
@ -1063,22 +1067,5 @@ def parse_template(source, state):
lappend(this)
else:
lappend(this)
if literal:
literals.append(''.join(literal))
if not isinstance(source, str):
# The tokenizer implicitly decodes bytes objects as latin-1, we must
# therefore re-encode the final representation.
literals = [None if s is None else s.encode('latin-1') for s in literals]
return groups, literals
def expand_template(template, match):
g = match.group
empty = match.string[:0]
groups, literals = template
literals = literals[:]
try:
for index, group in groups:
literals[index] = g(group) or empty
except IndexError:
raise error("invalid group reference %d" % index) from None
return empty.join(literals)
addliteral()
return result