gh-122288: Improve performances of fnmatch.translate (#122289)

Improve performance of this function by a factor of 1.7x.

Co-authored-by: Barney Gale <barney.gale@gmail.com>
This commit is contained in:
Bénédikt Tran 2024-11-27 17:42:45 +01:00 committed by GitHub
parent 14a05a8f43
commit 78cb377c62
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 112 additions and 47 deletions

View file

@ -77,24 +77,30 @@ def translate(pat):
There is no way to quote meta-characters.
"""
STAR = object()
parts = _translate(pat, STAR, '.')
return _join_translated_parts(parts, STAR)
parts, star_indices = _translate(pat, '*', '.')
return _join_translated_parts(parts, star_indices)
_re_setops_sub = re.compile(r'([&~|])').sub
_re_escape = functools.lru_cache(maxsize=512)(re.escape)
def _translate(pat, STAR, QUESTION_MARK):
def _translate(pat, star, question_mark):
res = []
add = res.append
star_indices = []
i, n = 0, len(pat)
while i < n:
c = pat[i]
i = i+1
if c == '*':
# store the position of the wildcard
star_indices.append(len(res))
add(star)
# compress consecutive `*` into one
if (not res) or res[-1] is not STAR:
add(STAR)
while i < n and pat[i] == '*':
i += 1
elif c == '?':
add(QUESTION_MARK)
add(question_mark)
elif c == '[':
j = i
if j < n and pat[j] == '!':
@ -133,8 +139,6 @@ def _translate(pat, STAR, QUESTION_MARK):
# Hyphens that create ranges shouldn't be escaped.
stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-')
for s in chunks)
# Escape set operations (&&, ~~ and ||).
stuff = re.sub(r'([&~|])', r'\\\1', stuff)
i = j+1
if not stuff:
# Empty range: never match.
@ -143,50 +147,40 @@ def _translate(pat, STAR, QUESTION_MARK):
# Negated empty range: match any character.
add('.')
else:
# Escape set operations (&&, ~~ and ||).
stuff = _re_setops_sub(r'\\\1', stuff)
if stuff[0] == '!':
stuff = '^' + stuff[1:]
elif stuff[0] in ('^', '['):
stuff = '\\' + stuff
add(f'[{stuff}]')
else:
add(re.escape(c))
add(_re_escape(c))
assert i == n
return res
return res, star_indices
def _join_translated_parts(inp, STAR):
# Deal with STARs.
res = []
add = res.append
i, n = 0, len(inp)
# Fixed pieces at the start?
while i < n and inp[i] is not STAR:
add(inp[i])
i += 1
# Now deal with STAR fixed STAR fixed ...
# For an interior `STAR fixed` pairing, we want to do a minimal
# .*? match followed by `fixed`, with no possibility of backtracking.
# Atomic groups ("(?>...)") allow us to spell that directly.
# Note: people rely on the undocumented ability to join multiple
# translate() results together via "|" to build large regexps matching
# "one of many" shell patterns.
while i < n:
assert inp[i] is STAR
i += 1
if i == n:
add(".*")
break
assert inp[i] is not STAR
fixed = []
while i < n and inp[i] is not STAR:
fixed.append(inp[i])
i += 1
fixed = "".join(fixed)
if i == n:
add(".*")
add(fixed)
else:
add(f"(?>.*?{fixed})")
assert i == n
res = "".join(res)
def _join_translated_parts(parts, star_indices):
if not star_indices:
return fr'(?s:{"".join(parts)})\Z'
iter_star_indices = iter(star_indices)
j = next(iter_star_indices)
buffer = parts[:j] # fixed pieces at the start
append, extend = buffer.append, buffer.extend
i = j + 1
for j in iter_star_indices:
# Now deal with STAR fixed STAR fixed ...
# For an interior `STAR fixed` pairing, we want to do a minimal
# .*? match followed by `fixed`, with no possibility of backtracking.
# Atomic groups ("(?>...)") allow us to spell that directly.
# Note: people rely on the undocumented ability to join multiple
# translate() results together via "|" to build large regexps matching
# "one of many" shell patterns.
append('(?>.*?')
extend(parts[i:j])
append(')')
i = j + 1
append('.*')
extend(parts[i:])
res = ''.join(buffer)
return fr'(?s:{res})\Z'