mirror of
https://github.com/python/cpython.git
synced 2025-07-16 15:55:18 +00:00
GH-73435: Implement recursive wildcards in pathlib.PurePath.match()
(#101398)
`PurePath.match()` now handles the `**` wildcard as in `Path.glob()`, i.e. it matches any number of path segments. We now compile a `re.Pattern` object for the entire pattern. This is made more difficult by `fnmatch` not treating directory separators as special when evaluating wildcards (`*`, `?`, etc), and so we arrange the path parts onto separate *lines* in a string, and ensure we don't set `re.DOTALL`. Co-authored-by: Hugo van Kemenade <hugovk@users.noreply.github.com> Co-authored-by: Alex Waygood <Alex.Waygood@Gmail.com>
This commit is contained in:
parent
4c770617c0
commit
49f90ba1ea
5 changed files with 123 additions and 15 deletions
|
@ -569,6 +569,13 @@ Pure paths provide the following methods and properties:
|
|||
>>> PurePath('a/b.py').match('/*.py')
|
||||
False
|
||||
|
||||
The *pattern* may be another path object; this speeds up matching the same
|
||||
pattern against multiple files::
|
||||
|
||||
>>> pattern = PurePath('*.py')
|
||||
>>> PurePath('a/b.py').match(pattern)
|
||||
True
|
||||
|
||||
As with other methods, case-sensitivity follows platform defaults::
|
||||
|
||||
>>> PurePosixPath('b.py').match('*.PY')
|
||||
|
@ -581,6 +588,10 @@ Pure paths provide the following methods and properties:
|
|||
.. versionadded:: 3.12
|
||||
The *case_sensitive* argument.
|
||||
|
||||
.. versionchanged:: 3.13
|
||||
Support for the recursive wildcard "``**``" was added. In previous
|
||||
versions, it acted like the non-recursive wildcard "``*``".
|
||||
|
||||
|
||||
.. method:: PurePath.relative_to(other, walk_up=False)
|
||||
|
||||
|
|
|
@ -90,6 +90,9 @@ Improved Modules
|
|||
pathlib
|
||||
-------
|
||||
|
||||
* Add support for recursive wildcards in :meth:`pathlib.PurePath.match`.
|
||||
(Contributed by Barney Gale in :gh:`73435`.)
|
||||
|
||||
* Add *follow_symlinks* keyword-only argument to :meth:`pathlib.Path.glob` and
|
||||
:meth:`~pathlib.Path.rglob`.
|
||||
(Contributed by Barney Gale in :gh:`77609`.)
|
||||
|
|
|
@ -54,6 +54,7 @@ def _ignore_error(exception):
|
|||
getattr(exception, 'winerror', None) in _IGNORED_WINERRORS)
|
||||
|
||||
|
||||
@functools.cache
|
||||
def _is_case_sensitive(flavour):
|
||||
return flavour.normcase('Aa') == 'Aa'
|
||||
|
||||
|
@ -61,6 +62,22 @@ def _is_case_sensitive(flavour):
|
|||
# Globbing helpers
|
||||
#
|
||||
|
||||
|
||||
# fnmatch.translate() returns a regular expression that includes a prefix and
|
||||
# a suffix, which enable matching newlines and ensure the end of the string is
|
||||
# matched, respectively. These features are undesirable for our implementation
|
||||
# of PurePath.match(), which represents path separators as newlines and joins
|
||||
# pattern segments together. As a workaround, we define a slice object that
|
||||
# can remove the prefix and suffix from any translate() result. See the
|
||||
# _compile_pattern_lines() function for more details.
|
||||
_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_')
|
||||
_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX))
|
||||
_SWAP_SEP_AND_NEWLINE = {
|
||||
'/': str.maketrans({'/': '\n', '\n': '/'}),
|
||||
'\\': str.maketrans({'\\': '\n', '\n': '\\'}),
|
||||
}
|
||||
|
||||
|
||||
@functools.lru_cache()
|
||||
def _make_selector(pattern_parts, flavour, case_sensitive):
|
||||
pat = pattern_parts[0]
|
||||
|
@ -92,6 +109,51 @@ def _compile_pattern(pat, case_sensitive):
|
|||
return re.compile(fnmatch.translate(pat), flags).match
|
||||
|
||||
|
||||
@functools.lru_cache()
|
||||
def _compile_pattern_lines(pattern_lines, case_sensitive):
|
||||
"""Compile the given pattern lines to an `re.Pattern` object.
|
||||
|
||||
The *pattern_lines* argument is a glob-style pattern (e.g. '**/*.py') with
|
||||
its path separators and newlines swapped (e.g. '**\n*.py'). By using
|
||||
newlines to separate path components, and not setting `re.DOTALL`, we
|
||||
ensure that the `*` wildcard cannot match path separators.
|
||||
|
||||
The returned `re.Pattern` object may have its `match()` method called to
|
||||
match a complete pattern, or `search()` to match from the right. The
|
||||
argument supplied to these methods must also have its path separators and
|
||||
newlines swapped.
|
||||
"""
|
||||
|
||||
# Match the start of the path, or just after a path separator
|
||||
parts = ['^']
|
||||
for part in pattern_lines.splitlines(keepends=True):
|
||||
if part == '**\n':
|
||||
# '**/' component: we use '[\s\S]' rather than '.' so that path
|
||||
# separators (i.e. newlines) are matched. The trailing '^' ensures
|
||||
# we terminate after a path separator (i.e. on a new line).
|
||||
part = r'[\s\S]*^'
|
||||
elif part == '**':
|
||||
# '**' component.
|
||||
part = r'[\s\S]*'
|
||||
elif '**' in part:
|
||||
raise ValueError("Invalid pattern: '**' can only be an entire path component")
|
||||
else:
|
||||
# Any other component: pass to fnmatch.translate(). We slice off
|
||||
# the common prefix and suffix added by translate() to ensure that
|
||||
# re.DOTALL is not set, and the end of the string not matched,
|
||||
# respectively. With DOTALL not set, '*' wildcards will not match
|
||||
# path separators, because the '.' characters in the pattern will
|
||||
# not match newlines.
|
||||
part = fnmatch.translate(part)[_FNMATCH_SLICE]
|
||||
parts.append(part)
|
||||
# Match the end of the path, always.
|
||||
parts.append(r'\Z')
|
||||
flags = re.MULTILINE
|
||||
if not case_sensitive:
|
||||
flags |= re.IGNORECASE
|
||||
return re.compile(''.join(parts), flags=flags)
|
||||
|
||||
|
||||
class _Selector:
|
||||
"""A selector matches a specific glob pattern part against the children
|
||||
of a given path."""
|
||||
|
@ -276,6 +338,10 @@ class PurePath:
|
|||
# to implement comparison methods like `__lt__()`.
|
||||
'_parts_normcase_cached',
|
||||
|
||||
# The `_lines_cached` slot stores the string path with path separators
|
||||
# and newlines swapped. This is used to implement `match()`.
|
||||
'_lines_cached',
|
||||
|
||||
# The `_hash` slot stores the hash of the case-normalized string
|
||||
# path. It's set when `__hash__()` is called for the first time.
|
||||
'_hash',
|
||||
|
@ -441,6 +507,16 @@ class PurePath:
|
|||
self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep)
|
||||
return self._parts_normcase_cached
|
||||
|
||||
@property
|
||||
def _lines(self):
|
||||
# Path with separators and newlines swapped, for pattern matching.
|
||||
try:
|
||||
return self._lines_cached
|
||||
except AttributeError:
|
||||
trans = _SWAP_SEP_AND_NEWLINE[self._flavour.sep]
|
||||
self._lines_cached = str(self).translate(trans)
|
||||
return self._lines_cached
|
||||
|
||||
def __eq__(self, other):
|
||||
if not isinstance(other, PurePath):
|
||||
return NotImplemented
|
||||
|
@ -697,23 +773,18 @@ class PurePath:
|
|||
"""
|
||||
Return True if this path matches the given pattern.
|
||||
"""
|
||||
if not isinstance(path_pattern, PurePath):
|
||||
path_pattern = self.with_segments(path_pattern)
|
||||
if case_sensitive is None:
|
||||
case_sensitive = _is_case_sensitive(self._flavour)
|
||||
pat = self.with_segments(path_pattern)
|
||||
if not pat.parts:
|
||||
pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive)
|
||||
if path_pattern.drive or path_pattern.root:
|
||||
return pattern.match(self._lines) is not None
|
||||
elif path_pattern._tail:
|
||||
return pattern.search(self._lines) is not None
|
||||
else:
|
||||
raise ValueError("empty pattern")
|
||||
pat_parts = pat.parts
|
||||
parts = self.parts
|
||||
if pat.drive or pat.root:
|
||||
if len(pat_parts) != len(parts):
|
||||
return False
|
||||
elif len(pat_parts) > len(parts):
|
||||
return False
|
||||
for part, pat in zip(reversed(parts), reversed(pat_parts)):
|
||||
match = _compile_pattern(pat, case_sensitive)
|
||||
if not match(part):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
# Subclassing os.PathLike makes isinstance() checks slower,
|
||||
# which in turn makes Path construction slower. Register instead!
|
||||
|
|
|
@ -310,8 +310,30 @@ class _BasePurePathTest(object):
|
|||
self.assertFalse(P('/ab.py').match('/a/*.py'))
|
||||
self.assertFalse(P('/a/b/c.py').match('/a/*.py'))
|
||||
# Multi-part glob-style pattern.
|
||||
self.assertFalse(P('/a/b/c.py').match('/**/*.py'))
|
||||
self.assertTrue(P('a').match('**'))
|
||||
self.assertTrue(P('c.py').match('**'))
|
||||
self.assertTrue(P('a/b/c.py').match('**'))
|
||||
self.assertTrue(P('/a/b/c.py').match('**'))
|
||||
self.assertTrue(P('/a/b/c.py').match('/**'))
|
||||
self.assertTrue(P('/a/b/c.py').match('**/'))
|
||||
self.assertTrue(P('/a/b/c.py').match('/a/**'))
|
||||
self.assertTrue(P('/a/b/c.py').match('**/*.py'))
|
||||
self.assertTrue(P('/a/b/c.py').match('/**/*.py'))
|
||||
self.assertTrue(P('/a/b/c.py').match('/a/**/*.py'))
|
||||
self.assertTrue(P('/a/b/c.py').match('/a/b/**/*.py'))
|
||||
self.assertTrue(P('/a/b/c.py').match('/**/**/**/**/*.py'))
|
||||
self.assertFalse(P('c.py').match('**/a.py'))
|
||||
self.assertFalse(P('c.py').match('c/**'))
|
||||
self.assertFalse(P('a/b/c.py').match('**/a'))
|
||||
self.assertFalse(P('a/b/c.py').match('**/a/b'))
|
||||
self.assertFalse(P('a/b/c.py').match('**/a/b/c'))
|
||||
self.assertFalse(P('a/b/c.py').match('**/a/b/c.'))
|
||||
self.assertFalse(P('a/b/c.py').match('**/a/b/c./**'))
|
||||
self.assertFalse(P('a/b/c.py').match('**/a/b/c./**'))
|
||||
self.assertFalse(P('a/b/c.py').match('/a/b/c.py/**'))
|
||||
self.assertFalse(P('a/b/c.py').match('/**/a/b/c.py'))
|
||||
self.assertRaises(ValueError, P('a').match, '**a/b/c')
|
||||
self.assertRaises(ValueError, P('a').match, 'a/b/c**')
|
||||
# Case-sensitive flag
|
||||
self.assertFalse(P('A.py').match('a.PY', case_sensitive=True))
|
||||
self.assertTrue(P('A.py').match('a.PY', case_sensitive=False))
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
Add support for recursive wildcards in :meth:`pathlib.PurePath.match`.
|
Loading…
Add table
Add a link
Reference in a new issue