GH-79634: Accept path-like objects as pathlib glob patterns. (#114017)

Allow `os.PathLike` objects to be passed as patterns to `pathlib.Path.glob()` and `rglob()`. (It's already possible to use them in `PurePath.match()`.)

While we're in the area:

- Allow empty glob patterns in `PathBase` (but not `Path`).
- Speed up globbing in `PathBase` by generating paths with trailing slashes only as a final step, rather than for every intermediate directory.
- Simplify and speed up handling of rare patterns involving both `**` and `..` segments.
This commit is contained in:
Barney Gale 2024-01-20 02:10:25 +00:00 committed by GitHub
parent 681e9e85a2
commit 6313cdde58
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 115 additions and 72 deletions

View file

@ -1036,6 +1036,9 @@ call fails (for example because the path doesn't exist).
future Python release, patterns with this ending will match both files future Python release, patterns with this ending will match both files
and directories. Add a trailing slash to match only directories. and directories. Add a trailing slash to match only directories.
.. versionchanged:: 3.13
The *pattern* parameter accepts a :term:`path-like object`.
.. method:: Path.group(*, follow_symlinks=True) .. method:: Path.group(*, follow_symlinks=True)
Return the name of the group owning the file. :exc:`KeyError` is raised Return the name of the group owning the file. :exc:`KeyError` is raised
@ -1498,6 +1501,9 @@ call fails (for example because the path doesn't exist).
.. versionchanged:: 3.13 .. versionchanged:: 3.13
The *follow_symlinks* parameter was added. The *follow_symlinks* parameter was added.
.. versionchanged:: 3.13
The *pattern* parameter accepts a :term:`path-like object`.
.. method:: Path.rmdir() .. method:: Path.rmdir()
Remove this directory. The directory must be empty. Remove this directory. The directory must be empty.

View file

@ -467,6 +467,29 @@ class PurePath(_abc.PurePathBase):
from urllib.parse import quote_from_bytes from urllib.parse import quote_from_bytes
return prefix + quote_from_bytes(os.fsencode(path)) return prefix + quote_from_bytes(os.fsencode(path))
@property
def _pattern_stack(self):
    """Return this pattern's components as a stack for use by glob().

    The result is a reversed copy of the path's tail, so the first
    component of the pattern is the last item of the list. An empty string
    is added (ending up at the front of the returned list) when the raw
    pattern ends with a path separator or when its final component is '**'.

    Raises NotImplementedError if the pattern has an anchor, and ValueError
    if it has no components at all.
    """
    components = self._tail.copy()
    raw = self._raw_path
    if self.anchor:
        raise NotImplementedError("Non-relative patterns are unsupported")
    if not components:
        raise ValueError("Unacceptable pattern: {!r}".format(raw))
    separators = (self.pathmod.sep, self.pathmod.altsep)
    if raw[-1] in separators:
        # GH-65238: pathlib drops the trailing slash while parsing, so it
        # is restored here as an empty final component.
        components.append('')
    elif components[-1] == '**':
        # GH-70303: a trailing '**' currently matches directories only, so
        # it is treated as if it were followed by a slash.
        # NOTE(review): stacklevel 4 presumably attributes the warning to
        # the caller of glob()/rglob() -- confirm against the call chain.
        warnings.warn(
            "Pattern ending '**' will match files and directories in a "
            "future Python release. Add a trailing slash to match only "
            "directories and remove this warning.",
            FutureWarning, 4)
        components.append('')
    return components[::-1]
# Subclassing os.PathLike makes isinstance() checks slower, # Subclassing os.PathLike makes isinstance() checks slower,
# which in turn makes Path construction slower. Register instead! # which in turn makes Path construction slower. Register instead!
@ -580,7 +603,7 @@ class Path(_abc.PathBase, PurePath):
def _scandir(self): def _scandir(self):
return os.scandir(self) return os.scandir(self)
def _make_child_entry(self, entry, is_dir=False): def _make_child_entry(self, entry):
# Transform an entry yielded from _scandir() into a path object. # Transform an entry yielded from _scandir() into a path object.
path_str = entry.name if str(self) == '.' else entry.path path_str = entry.name if str(self) == '.' else entry.path
path = self.with_segments(path_str) path = self.with_segments(path_str)
@ -591,6 +614,8 @@ class Path(_abc.PathBase, PurePath):
return path return path
def _make_child_relpath(self, name): def _make_child_relpath(self, name):
if not name:
return self
path_str = str(self) path_str = str(self)
tail = self._tail tail = self._tail
if tail: if tail:
@ -611,14 +636,8 @@ class Path(_abc.PathBase, PurePath):
kind, including directories) matching the given relative pattern. kind, including directories) matching the given relative pattern.
""" """
sys.audit("pathlib.Path.glob", self, pattern) sys.audit("pathlib.Path.glob", self, pattern)
if pattern.endswith('**'): if not isinstance(pattern, PurePath):
# GH-70303: '**' only matches directories. Add trailing slash. pattern = self.with_segments(pattern)
warnings.warn(
"Pattern ending '**' will match files and directories in a "
"future Python release. Add a trailing slash to match only "
"directories and remove this warning.",
FutureWarning, 2)
pattern = f'{pattern}/'
return _abc.PathBase.glob( return _abc.PathBase.glob(
self, pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks) self, pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
@ -628,15 +647,9 @@ class Path(_abc.PathBase, PurePath):
this subtree. this subtree.
""" """
sys.audit("pathlib.Path.rglob", self, pattern) sys.audit("pathlib.Path.rglob", self, pattern)
if pattern.endswith('**'): if not isinstance(pattern, PurePath):
# GH-70303: '**' only matches directories. Add trailing slash. pattern = self.with_segments(pattern)
warnings.warn( pattern = '**' / pattern
"Pattern ending '**' will match files and directories in a "
"future Python release. Add a trailing slash to match only "
"directories and remove this warning.",
FutureWarning, 2)
pattern = f'{pattern}/'
pattern = f'**/{pattern}'
return _abc.PathBase.glob( return _abc.PathBase.glob(
self, pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks) self, pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)

View file

@ -63,6 +63,12 @@ def _compile_pattern(pat, sep, case_sensitive):
return re.compile(regex, flags=flags).match return re.compile(regex, flags=flags).match
def _select_special(paths, part):
    """Lazily yield, for each given path, its child named by the special
    literal component *part*, as built by ``_make_child_relpath()``."""
    yield from (parent._make_child_relpath(part) for parent in paths)
def _select_children(parent_paths, dir_only, follow_symlinks, match): def _select_children(parent_paths, dir_only, follow_symlinks, match):
"""Yield direct children of given paths, filtering by name and type.""" """Yield direct children of given paths, filtering by name and type."""
if follow_symlinks is None: if follow_symlinks is None:
@ -84,7 +90,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
except OSError: except OSError:
continue continue
if match(entry.name): if match(entry.name):
yield parent_path._make_child_entry(entry, dir_only) yield parent_path._make_child_entry(entry)
def _select_recursive(parent_paths, dir_only, follow_symlinks): def _select_recursive(parent_paths, dir_only, follow_symlinks):
@ -107,7 +113,7 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks):
for entry in entries: for entry in entries:
try: try:
if entry.is_dir(follow_symlinks=follow_symlinks): if entry.is_dir(follow_symlinks=follow_symlinks):
paths.append(path._make_child_entry(entry, dir_only)) paths.append(path._make_child_entry(entry))
continue continue
except OSError: except OSError:
pass pass
@ -427,6 +433,14 @@ class PurePathBase:
a drive).""" a drive)."""
return self.pathmod.isabs(self._raw_path) return self.pathmod.isabs(self._raw_path)
@property
def _pattern_stack(self):
    """Return the component stack of this pattern for use by glob().

    The stack is the second item of ``self._stack``. Raises
    NotImplementedError when the first item (the anchor) is non-empty,
    because only relative patterns are supported.
    """
    root, components = self._stack
    if not root:
        return components
    raise NotImplementedError("Non-relative patterns are unsupported")
def match(self, path_pattern, *, case_sensitive=None): def match(self, path_pattern, *, case_sensitive=None):
""" """
Return True if this path matches the given pattern. Return True if this path matches the given pattern.
@ -436,11 +450,10 @@ class PurePathBase:
if case_sensitive is None: if case_sensitive is None:
case_sensitive = _is_case_sensitive(self.pathmod) case_sensitive = _is_case_sensitive(self.pathmod)
sep = path_pattern.pathmod.sep sep = path_pattern.pathmod.sep
pattern_str = str(path_pattern)
if path_pattern.anchor: if path_pattern.anchor:
pass pattern_str = str(path_pattern)
elif path_pattern.parts: elif path_pattern.parts:
pattern_str = f'**{sep}{pattern_str}' pattern_str = str('**' / path_pattern)
else: else:
raise ValueError("empty pattern") raise ValueError("empty pattern")
match = _compile_pattern(pattern_str, sep, case_sensitive) match = _compile_pattern(pattern_str, sep, case_sensitive)
@ -714,10 +727,8 @@ class PathBase(PurePathBase):
from contextlib import nullcontext from contextlib import nullcontext
return nullcontext(self.iterdir()) return nullcontext(self.iterdir())
def _make_child_entry(self, entry, is_dir=False): def _make_child_entry(self, entry):
# Transform an entry yielded from _scandir() into a path object. # Transform an entry yielded from _scandir() into a path object.
if is_dir:
return entry.joinpath('')
return entry return entry
def _make_child_relpath(self, name): def _make_child_relpath(self, name):
@ -727,57 +738,35 @@ class PathBase(PurePathBase):
"""Iterate over this subtree and yield all existing files (of any """Iterate over this subtree and yield all existing files (of any
kind, including directories) matching the given relative pattern. kind, including directories) matching the given relative pattern.
""" """
path_pattern = self.with_segments(pattern) if not isinstance(pattern, PurePathBase):
if path_pattern.anchor: pattern = self.with_segments(pattern)
raise NotImplementedError("Non-relative patterns are unsupported")
elif not path_pattern.parts:
raise ValueError("Unacceptable pattern: {!r}".format(pattern))
pattern_parts = list(path_pattern.parts)
if not self.pathmod.split(pattern)[1]:
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
pattern_parts.append('')
if case_sensitive is None: if case_sensitive is None:
# TODO: evaluate case-sensitivity of each directory in _select_children(). # TODO: evaluate case-sensitivity of each directory in _select_children().
case_sensitive = _is_case_sensitive(self.pathmod) case_sensitive = _is_case_sensitive(self.pathmod)
# If symlinks are handled consistently, and the pattern does not stack = pattern._pattern_stack
# contain '..' components, then we can use a 'walk-and-match' strategy specials = ('', '.', '..')
# when expanding '**' wildcards. When a '**' wildcard is encountered, filter_paths = False
# all following pattern parts are immediately consumed and used to
# build a `re.Pattern` object. This pattern is used to filter the
# recursive walk. As a result, pattern parts following a '**' wildcard
# do not perform any filesystem access, which can be much faster!
filter_paths = follow_symlinks is not None and '..' not in pattern_parts
deduplicate_paths = False deduplicate_paths = False
sep = self.pathmod.sep sep = self.pathmod.sep
paths = iter([self.joinpath('')] if self.is_dir() else []) paths = iter([self.joinpath('')] if self.is_dir() else [])
part_idx = 0 while stack:
while part_idx < len(pattern_parts): part = stack.pop()
part = pattern_parts[part_idx] if part in specials:
part_idx += 1 paths = _select_special(paths, part)
if part == '':
# Trailing slash.
pass
elif part == '..':
paths = (path._make_child_relpath('..') for path in paths)
elif part == '**': elif part == '**':
# Consume adjacent '**' components. # Consume adjacent '**' components.
while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**': while stack and stack[-1] == '**':
part_idx += 1 stack.pop()
if filter_paths and part_idx < len(pattern_parts) and pattern_parts[part_idx] != '': # Consume adjacent non-special components and enable post-walk
dir_only = pattern_parts[-1] == '' # regex filtering, provided we're treating symlinks consistently.
paths = _select_recursive(paths, dir_only, follow_symlinks) if follow_symlinks is not None:
while stack and stack[-1] not in specials:
filter_paths = True
stack.pop()
# Filter out paths that don't match pattern. dir_only = bool(stack)
prefix_len = len(str(self._make_child_relpath('_'))) - 1
match = _compile_pattern(str(path_pattern), sep, case_sensitive)
paths = (path for path in paths if match(str(path), prefix_len))
return paths
dir_only = part_idx < len(pattern_parts)
paths = _select_recursive(paths, dir_only, follow_symlinks) paths = _select_recursive(paths, dir_only, follow_symlinks)
if deduplicate_paths: if deduplicate_paths:
# De-duplicate if we've already seen a '**' component. # De-duplicate if we've already seen a '**' component.
@ -786,9 +775,14 @@ class PathBase(PurePathBase):
elif '**' in part: elif '**' in part:
raise ValueError("Invalid pattern: '**' can only be an entire path component") raise ValueError("Invalid pattern: '**' can only be an entire path component")
else: else:
dir_only = part_idx < len(pattern_parts) dir_only = bool(stack)
match = _compile_pattern(part, sep, case_sensitive) match = _compile_pattern(part, sep, case_sensitive)
paths = _select_children(paths, dir_only, follow_symlinks, match) paths = _select_children(paths, dir_only, follow_symlinks, match)
if filter_paths:
# Filter out paths that don't match pattern.
prefix_len = len(str(self._make_child_relpath('_'))) - 1
match = _compile_pattern(str(pattern), sep, case_sensitive)
paths = (path for path in paths if match(str(path), prefix_len))
return paths return paths
def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None): def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
@ -796,8 +790,10 @@ class PathBase(PurePathBase):
directories) matching the given relative pattern, anywhere in directories) matching the given relative pattern, anywhere in
this subtree. this subtree.
""" """
return self.glob( if not isinstance(pattern, PurePathBase):
f'**/{pattern}', case_sensitive=case_sensitive, follow_symlinks=follow_symlinks) pattern = self.with_segments(pattern)
pattern = '**' / pattern
return self.glob(pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
def walk(self, top_down=True, on_error=None, follow_symlinks=False): def walk(self, top_down=True, on_error=None, follow_symlinks=False):
"""Walk the directory tree from this directory, similar to os.walk().""" """Walk the directory tree from this directory, similar to os.walk()."""

View file

@ -1818,6 +1818,13 @@ class PathTest(test_pathlib_abc.DummyPathTest, PurePathTest):
list(base.walk()) list(base.walk())
list(base.walk(top_down=False)) list(base.walk(top_down=False))
def test_glob_empty_pattern(self):
    # Both '' and '.' must be rejected as glob patterns by this class.
    root = self.cls('')
    for empty_pattern in ('', '.'):
        with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'):
            list(root.glob(empty_pattern))
def test_glob_many_open_files(self): def test_glob_many_open_files(self):
depth = 30 depth = 30
P = self.cls P = self.cls
@ -1860,6 +1867,22 @@ class PathTest(test_pathlib_abc.DummyPathTest, PurePathTest):
with self.assertWarns(FutureWarning): with self.assertWarns(FutureWarning):
p.rglob('*/**') p.rglob('*/**')
def test_glob_pathlike(self):
    # glob() must give the same results whether the pattern is supplied as
    # a path object of this class or as a FakePath wrapper.
    P = self.cls
    base = P(self.base)
    pattern = "dir*/file*"
    expected = {base / "dirB/fileB", base / "dirC/fileC"}
    for wrap in (P, FakePath):
        self.assertEqual(expected, set(base.glob(wrap(pattern))))
def test_rglob_pathlike(self):
    # rglob() must give the same results whether the pattern is supplied as
    # a path object of this class or as a FakePath wrapper.
    P = self.cls
    base = P(self.base, "dirC")
    pattern = "**/file*"
    expected = {base / "fileC", base / "dirD/fileD"}
    for wrap in (P, FakePath):
        self.assertEqual(expected, set(base.rglob(wrap(pattern))))
@only_posix @only_posix
class PosixPathTest(PathTest, PurePosixPathTest): class PosixPathTest(PathTest, PurePosixPathTest):

View file

@ -1045,9 +1045,12 @@ class DummyPathTest(DummyPurePathTest):
_check(p.glob("*/"), ["dirA/", "dirB/", "dirC/", "dirE/", "linkB/"]) _check(p.glob("*/"), ["dirA/", "dirB/", "dirC/", "dirE/", "linkB/"])
def test_glob_empty_pattern(self): def test_glob_empty_pattern(self):
p = self.cls('') def _check(glob, expected):
with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'): self.assertEqual(set(glob), { P(self.base, q) for q in expected })
list(p.glob('')) P = self.cls
p = P(self.base)
_check(p.glob(""), [""])
_check(p.glob("."), ["."])
def test_glob_case_sensitive(self): def test_glob_case_sensitive(self):
P = self.cls P = self.cls

View file

@ -0,0 +1,2 @@
Accept :term:`path-like objects <path-like object>` as patterns in
:meth:`pathlib.Path.glob` and :meth:`~pathlib.Path.rglob`.