mirror of
https://github.com/python/cpython.git
synced 2025-08-03 08:34:29 +00:00
gh-91524: Speed up the regular expression substitution (#91525)
Functions re.sub() and re.subn() and the corresponding re.Pattern methods are now 2-3 times faster for replacement strings containing group references. Closes #91524. Primarily authored by Serhiy Storchaka (serhiy-storchaka). Minor-cleanups-by: Gregory P. Smith [Google] <greg@krypto.org>
This commit is contained in:
parent
176b6c57be
commit
75a6fadf36
9 changed files with 358 additions and 91 deletions
|
@ -124,6 +124,7 @@ This module also defines an exception 'error'.
|
|||
import enum
|
||||
from . import _compiler, _parser
|
||||
import functools
|
||||
import _sre
|
||||
|
||||
|
||||
# public symbols
|
||||
|
@ -230,7 +231,7 @@ def purge():
|
|||
"Clear the regular expression caches"
|
||||
_cache.clear()
|
||||
_cache2.clear()
|
||||
_compile_repl.cache_clear()
|
||||
_compile_template.cache_clear()
|
||||
|
||||
def template(pattern, flags=0):
|
||||
"Compile a template pattern, returning a Pattern object, deprecated"
|
||||
|
@ -328,24 +329,9 @@ def _compile(pattern, flags):
|
|||
return p
|
||||
|
||||
@functools.lru_cache(_MAXCACHE)
|
||||
def _compile_repl(repl, pattern):
|
||||
def _compile_template(pattern, repl):
|
||||
# internal: compile replacement pattern
|
||||
return _parser.parse_template(repl, pattern)
|
||||
|
||||
def _expand(pattern, match, template):
|
||||
# internal: Match.expand implementation hook
|
||||
template = _parser.parse_template(template, pattern)
|
||||
return _parser.expand_template(template, match)
|
||||
|
||||
def _subx(pattern, template):
|
||||
# internal: Pattern.sub/subn implementation helper
|
||||
template = _compile_repl(template, pattern)
|
||||
if not template[0] and len(template[1]) == 1:
|
||||
# literal replacement
|
||||
return template[1][0]
|
||||
def filter(match, template=template):
|
||||
return _parser.expand_template(template, match)
|
||||
return filter
|
||||
return _sre.template(pattern, _parser.parse_template(repl, pattern))
|
||||
|
||||
# register myself for pickling
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
# update when constants are added or removed
|
||||
|
||||
MAGIC = 20220615
|
||||
MAGIC = 20221023
|
||||
|
||||
from _sre import MAXREPEAT, MAXGROUPS
|
||||
|
||||
|
|
|
@ -984,24 +984,28 @@ def parse(str, flags=0, state=None):
|
|||
|
||||
return p
|
||||
|
||||
def parse_template(source, state):
|
||||
def parse_template(source, pattern):
|
||||
# parse 're' replacement string into list of literals and
|
||||
# group references
|
||||
s = Tokenizer(source)
|
||||
sget = s.get
|
||||
groups = []
|
||||
literals = []
|
||||
result = []
|
||||
literal = []
|
||||
lappend = literal.append
|
||||
def addliteral():
|
||||
if s.istext:
|
||||
result.append(''.join(literal))
|
||||
else:
|
||||
# The tokenizer implicitly decodes bytes objects as latin-1, we must
|
||||
# therefore re-encode the final representation.
|
||||
result.append(''.join(literal).encode('latin-1'))
|
||||
del literal[:]
|
||||
def addgroup(index, pos):
|
||||
if index > state.groups:
|
||||
if index > pattern.groups:
|
||||
raise s.error("invalid group reference %d" % index, pos)
|
||||
if literal:
|
||||
literals.append(''.join(literal))
|
||||
del literal[:]
|
||||
groups.append((len(literals), index))
|
||||
literals.append(None)
|
||||
groupindex = state.groupindex
|
||||
addliteral()
|
||||
result.append(index)
|
||||
groupindex = pattern.groupindex
|
||||
while True:
|
||||
this = sget()
|
||||
if this is None:
|
||||
|
@ -1063,22 +1067,5 @@ def parse_template(source, state):
|
|||
lappend(this)
|
||||
else:
|
||||
lappend(this)
|
||||
if literal:
|
||||
literals.append(''.join(literal))
|
||||
if not isinstance(source, str):
|
||||
# The tokenizer implicitly decodes bytes objects as latin-1, we must
|
||||
# therefore re-encode the final representation.
|
||||
literals = [None if s is None else s.encode('latin-1') for s in literals]
|
||||
return groups, literals
|
||||
|
||||
def expand_template(template, match):
|
||||
g = match.group
|
||||
empty = match.string[:0]
|
||||
groups, literals = template
|
||||
literals = literals[:]
|
||||
try:
|
||||
for index, group in groups:
|
||||
literals[index] = g(group) or empty
|
||||
except IndexError:
|
||||
raise error("invalid group reference %d" % index) from None
|
||||
return empty.join(literals)
|
||||
addliteral()
|
||||
return result
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue