mirror of
https://github.com/python/cpython.git
synced 2025-07-29 06:05:00 +00:00
GH-72904: Add glob.translate()
function (#106703)
Add `glob.translate()` function that converts a pathname with shell wildcards to a regular expression. The regular expression is used by pathlib to implement `match()` and `glob()`. This function differs from `fnmatch.translate()` in that wildcards do not match path separators by default, and that a `*` pattern segment matches precisely one path segment. When *recursive* is set to true, `**` pattern segments match any number of path segments, and `**` cannot appear outside its own segment. In pathlib, this change speeds up directory walking (because `_make_child_relpath()` does less work), makes path objects smaller (they don't need a `_lines` slot), and removes the need for some gnarly code. Co-authored-by: Jason R. Coombs <jaraco@jaraco.com> Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com>
This commit is contained in:
parent
babb787047
commit
cf67ebfb31
7 changed files with 229 additions and 106 deletions
125
Lib/pathlib.py
125
Lib/pathlib.py
|
@ -6,8 +6,8 @@ operating systems.
|
|||
"""
|
||||
|
||||
import contextlib
|
||||
import fnmatch
|
||||
import functools
|
||||
import glob
|
||||
import io
|
||||
import ntpath
|
||||
import os
|
||||
|
@ -76,78 +76,16 @@ def _is_case_sensitive(pathmod):
|
|||
#
|
||||
|
||||
|
||||
# fnmatch.translate() returns a regular expression that includes a prefix and
|
||||
# a suffix, which enable matching newlines and ensure the end of the string is
|
||||
# matched, respectively. These features are undesirable for our implementation
|
||||
# of PurePatch.match(), which represents path separators as newlines and joins
|
||||
# pattern segments together. As a workaround, we define a slice object that
|
||||
# can remove the prefix and suffix from any translate() result. See the
|
||||
# _compile_pattern_lines() function for more details.
|
||||
_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_')
|
||||
_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX))
|
||||
_SWAP_SEP_AND_NEWLINE = {
|
||||
'/': str.maketrans({'/': '\n', '\n': '/'}),
|
||||
'\\': str.maketrans({'\\': '\n', '\n': '\\'}),
|
||||
}
|
||||
|
||||
|
||||
@functools.lru_cache(maxsize=256)
|
||||
def _compile_pattern(pat, case_sensitive):
|
||||
def _compile_pattern(pat, sep, case_sensitive):
|
||||
"""Compile given glob pattern to a re.Pattern object (observing case
|
||||
sensitivity), or None if the pattern should match everything."""
|
||||
if pat == '*':
|
||||
return None
|
||||
sensitivity)."""
|
||||
flags = re.NOFLAG if case_sensitive else re.IGNORECASE
|
||||
return re.compile(fnmatch.translate(pat), flags).match
|
||||
|
||||
|
||||
@functools.lru_cache()
|
||||
def _compile_pattern_lines(pattern_lines, case_sensitive):
|
||||
"""Compile the given pattern lines to an `re.Pattern` object.
|
||||
|
||||
The *pattern_lines* argument is a glob-style pattern (e.g. '**/*.py') with
|
||||
its path separators and newlines swapped (e.g. '**\n*.py`). By using
|
||||
newlines to separate path components, and not setting `re.DOTALL`, we
|
||||
ensure that the `*` wildcard cannot match path separators.
|
||||
|
||||
The returned `re.Pattern` object may have its `match()` method called to
|
||||
match a complete pattern, or `search()` to match from the right. The
|
||||
argument supplied to these methods must also have its path separators and
|
||||
newlines swapped.
|
||||
"""
|
||||
|
||||
# Match the start of the path, or just after a path separator
|
||||
parts = ['^']
|
||||
for part in pattern_lines.splitlines(keepends=True):
|
||||
if part == '*\n':
|
||||
part = r'.+\n'
|
||||
elif part == '*':
|
||||
part = r'.+'
|
||||
elif part == '**\n':
|
||||
# '**/' component: we use '(?s:.)' rather than '.' so that path
|
||||
# separators (i.e. newlines) are matched. The trailing '^' ensures
|
||||
# we terminate after a path separator (i.e. on a new line).
|
||||
part = r'(?s:.)*^'
|
||||
elif part == '**':
|
||||
# '**' component.
|
||||
part = r'(?s:.)*'
|
||||
elif '**' in part:
|
||||
raise ValueError("Invalid pattern: '**' can only be an entire path component")
|
||||
else:
|
||||
# Any other component: pass to fnmatch.translate(). We slice off
|
||||
# the common prefix and suffix added by translate() to ensure that
|
||||
# re.DOTALL is not set, and the end of the string not matched,
|
||||
# respectively. With DOTALL not set, '*' wildcards will not match
|
||||
# path separators, because the '.' characters in the pattern will
|
||||
# not match newlines.
|
||||
part = fnmatch.translate(part)[_FNMATCH_SLICE]
|
||||
parts.append(part)
|
||||
# Match the end of the path, always.
|
||||
parts.append(r'\Z')
|
||||
flags = re.MULTILINE
|
||||
if not case_sensitive:
|
||||
flags |= re.IGNORECASE
|
||||
return re.compile(''.join(parts), flags=flags)
|
||||
regex = glob.translate(pat, recursive=True, include_hidden=True, seps=sep)
|
||||
# The string representation of an empty path is a single dot ('.'). Empty
|
||||
# paths shouldn't match wildcards, so we consume it with an atomic group.
|
||||
regex = r'(\.\Z)?+' + regex
|
||||
return re.compile(regex, flags).match
|
||||
|
||||
|
||||
def _select_children(parent_paths, dir_only, follow_symlinks, match):
|
||||
|
@ -171,7 +109,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
|
|||
except OSError:
|
||||
continue
|
||||
name = entry.name
|
||||
if match is None or match(name):
|
||||
if match(name):
|
||||
yield parent_path._make_child_relpath(name)
|
||||
|
||||
|
||||
|
@ -297,10 +235,6 @@ class PurePath:
|
|||
# to implement comparison methods like `__lt__()`.
|
||||
'_parts_normcase_cached',
|
||||
|
||||
# The `_lines_cached` slot stores the string path with path separators
|
||||
# and newlines swapped. This is used to implement `match()`.
|
||||
'_lines_cached',
|
||||
|
||||
# The `_hash` slot stores the hash of the case-normalized string
|
||||
# path. It's set when `__hash__()` is called for the first time.
|
||||
'_hash',
|
||||
|
@ -475,20 +409,6 @@ class PurePath:
|
|||
self._parts_normcase_cached = self._str_normcase.split(self.pathmod.sep)
|
||||
return self._parts_normcase_cached
|
||||
|
||||
@property
|
||||
def _lines(self):
|
||||
# Path with separators and newlines swapped, for pattern matching.
|
||||
try:
|
||||
return self._lines_cached
|
||||
except AttributeError:
|
||||
path_str = str(self)
|
||||
if path_str == '.':
|
||||
self._lines_cached = ''
|
||||
else:
|
||||
trans = _SWAP_SEP_AND_NEWLINE[self.pathmod.sep]
|
||||
self._lines_cached = path_str.translate(trans)
|
||||
return self._lines_cached
|
||||
|
||||
def __eq__(self, other):
|
||||
if not isinstance(other, PurePath):
|
||||
return NotImplemented
|
||||
|
@ -763,13 +683,16 @@ class PurePath:
|
|||
path_pattern = self.with_segments(path_pattern)
|
||||
if case_sensitive is None:
|
||||
case_sensitive = _is_case_sensitive(self.pathmod)
|
||||
pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive)
|
||||
sep = path_pattern.pathmod.sep
|
||||
pattern_str = str(path_pattern)
|
||||
if path_pattern.drive or path_pattern.root:
|
||||
return pattern.match(self._lines) is not None
|
||||
pass
|
||||
elif path_pattern._tail:
|
||||
return pattern.search(self._lines) is not None
|
||||
pattern_str = f'**{sep}{pattern_str}'
|
||||
else:
|
||||
raise ValueError("empty pattern")
|
||||
match = _compile_pattern(pattern_str, sep, case_sensitive)
|
||||
return match(str(self)) is not None
|
||||
|
||||
|
||||
# Subclassing os.PathLike makes isinstance() checks slower,
|
||||
|
@ -1069,26 +992,19 @@ class _PathBase(PurePath):
|
|||
return contextlib.nullcontext(self.iterdir())
|
||||
|
||||
def _make_child_relpath(self, name):
|
||||
sep = self.pathmod.sep
|
||||
lines_name = name.replace('\n', sep)
|
||||
lines_str = self._lines
|
||||
path_str = str(self)
|
||||
tail = self._tail
|
||||
if tail:
|
||||
path_str = f'{path_str}{sep}{name}'
|
||||
lines_str = f'{lines_str}\n{lines_name}'
|
||||
path_str = f'{path_str}{self.pathmod.sep}{name}'
|
||||
elif path_str != '.':
|
||||
path_str = f'{path_str}{name}'
|
||||
lines_str = f'{lines_str}{lines_name}'
|
||||
else:
|
||||
path_str = name
|
||||
lines_str = lines_name
|
||||
path = self.with_segments(path_str)
|
||||
path._str = path_str
|
||||
path._drv = self.drive
|
||||
path._root = self.root
|
||||
path._tail_cached = tail + [name]
|
||||
path._lines_cached = lines_str
|
||||
return path
|
||||
|
||||
def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
|
||||
|
@ -1139,6 +1055,7 @@ class _PathBase(PurePath):
|
|||
# do not perform any filesystem access, which can be much faster!
|
||||
filter_paths = follow_symlinks is not None and '..' not in pattern_parts
|
||||
deduplicate_paths = False
|
||||
sep = self.pathmod.sep
|
||||
paths = iter([self] if self.is_dir() else [])
|
||||
part_idx = 0
|
||||
while part_idx < len(pattern_parts):
|
||||
|
@ -1159,9 +1076,9 @@ class _PathBase(PurePath):
|
|||
paths = _select_recursive(paths, dir_only, follow_symlinks)
|
||||
|
||||
# Filter out paths that don't match pattern.
|
||||
prefix_len = len(self._make_child_relpath('_')._lines) - 1
|
||||
match = _compile_pattern_lines(path_pattern._lines, case_sensitive).match
|
||||
paths = (path for path in paths if match(path._lines[prefix_len:]))
|
||||
prefix_len = len(str(self._make_child_relpath('_'))) - 1
|
||||
match = _compile_pattern(str(path_pattern), sep, case_sensitive)
|
||||
paths = (path for path in paths if match(str(path), prefix_len))
|
||||
return paths
|
||||
|
||||
dir_only = part_idx < len(pattern_parts)
|
||||
|
@ -1174,7 +1091,7 @@ class _PathBase(PurePath):
|
|||
raise ValueError("Invalid pattern: '**' can only be an entire path component")
|
||||
else:
|
||||
dir_only = part_idx < len(pattern_parts)
|
||||
match = _compile_pattern(part, case_sensitive)
|
||||
match = _compile_pattern(part, sep, case_sensitive)
|
||||
paths = _select_children(paths, dir_only, follow_symlinks, match)
|
||||
return paths
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue