mirror of
https://github.com/python/cpython.git
synced 2025-08-04 00:48:58 +00:00
gh-91524: Speed up the regular expression substitution (#91525)
The functions re.sub() and re.subn() and the corresponding re.Pattern methods are now 2-3 times faster for replacement strings containing group references. Closes #91524. Primarily authored by Serhiy Storchaka (serhiy-storchaka). Minor-cleanups-by: Gregory P. Smith [Google] <greg@krypto.org>
This commit is contained in:
parent
176b6c57be
commit
75a6fadf36
9 changed files with 358 additions and 91 deletions
|
@ -984,24 +984,28 @@ def parse(str, flags=0, state=None):
|
|||
|
||||
return p
|
||||
|
||||
def parse_template(source, state):
|
||||
def parse_template(source, pattern):
|
||||
# parse 're' replacement string into list of literals and
|
||||
# group references
|
||||
s = Tokenizer(source)
|
||||
sget = s.get
|
||||
groups = []
|
||||
literals = []
|
||||
result = []
|
||||
literal = []
|
||||
lappend = literal.append
|
||||
def addliteral():
|
||||
if s.istext:
|
||||
result.append(''.join(literal))
|
||||
else:
|
||||
# The tokenizer implicitly decodes bytes objects as latin-1, we must
|
||||
# therefore re-encode the final representation.
|
||||
result.append(''.join(literal).encode('latin-1'))
|
||||
del literal[:]
|
||||
def addgroup(index, pos):
|
||||
if index > state.groups:
|
||||
if index > pattern.groups:
|
||||
raise s.error("invalid group reference %d" % index, pos)
|
||||
if literal:
|
||||
literals.append(''.join(literal))
|
||||
del literal[:]
|
||||
groups.append((len(literals), index))
|
||||
literals.append(None)
|
||||
groupindex = state.groupindex
|
||||
addliteral()
|
||||
result.append(index)
|
||||
groupindex = pattern.groupindex
|
||||
while True:
|
||||
this = sget()
|
||||
if this is None:
|
||||
|
@ -1063,22 +1067,5 @@ def parse_template(source, state):
|
|||
lappend(this)
|
||||
else:
|
||||
lappend(this)
|
||||
if literal:
|
||||
literals.append(''.join(literal))
|
||||
if not isinstance(source, str):
|
||||
# The tokenizer implicitly decodes bytes objects as latin-1, we must
|
||||
# therefore re-encode the final representation.
|
||||
literals = [None if s is None else s.encode('latin-1') for s in literals]
|
||||
return groups, literals
|
||||
|
||||
def expand_template(template, match):
    """Expand a parsed replacement template against *match*.

    *template* is a ``(groups, literals)`` pair.  *literals* is a list of
    literal chunks in which each group reference is a ``None`` placeholder;
    *groups* is a list of ``(index, group)`` pairs, where *index* is the
    position of a placeholder in *literals* and *group* is the group
    reference whose text fills it.

    Returns the chunks joined together, using the same type (str or bytes)
    as ``match.string``.  Raises ``error`` when a lookup fails with
    IndexError, naming the offending group reference.
    """
    g = match.group
    # Zero-length slice of the subject: an empty str or bytes of the right
    # type, used both as the joiner and as the value for groups that did
    # not participate in the match (for which match.group() returns None).
    empty = match.string[:0]
    groups, literals = template
    # Work on a copy so the caller's template list is not mutated.
    literals = literals[:]
    try:
        for index, group in groups:
            literals[index] = g(group) or empty
    except IndexError:
        # Report the group reference, not the slot position in `literals`.
        raise error("invalid group reference %d" % group) from None
    return empty.join(literals)
|
||||
addliteral()
|
||||
return result
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue