mirror of
https://github.com/python/cpython.git
synced 2025-07-14 23:05:17 +00:00
GH-79634: Accept path-like objects as pathlib glob patterns. (#114017)
Allow `os.PathLike` objects to be passed as patterns to `pathlib.Path.glob()` and `rglob()`. (It's already possible to use them in `PurePath.match()`.)

While we're in the area:

- Allow empty glob patterns in `PathBase` (but not `Path`).
- Speed up globbing in `PathBase` by generating paths with trailing slashes only as a final step, rather than for every intermediate directory.
- Simplify and speed up handling of rare patterns involving both `**` and `..` segments.
This commit is contained in:
parent
681e9e85a2
commit
6313cdde58
6 changed files with 115 additions and 72 deletions
|
@ -1036,6 +1036,9 @@ call fails (for example because the path doesn't exist).
|
||||||
future Python release, patterns with this ending will match both files
|
future Python release, patterns with this ending will match both files
|
||||||
and directories. Add a trailing slash to match only directories.
|
and directories. Add a trailing slash to match only directories.
|
||||||
|
|
||||||
|
.. versionchanged:: 3.13
|
||||||
|
The *pattern* parameter accepts a :term:`path-like object`.
|
||||||
|
|
||||||
.. method:: Path.group(*, follow_symlinks=True)
|
.. method:: Path.group(*, follow_symlinks=True)
|
||||||
|
|
||||||
Return the name of the group owning the file. :exc:`KeyError` is raised
|
Return the name of the group owning the file. :exc:`KeyError` is raised
|
||||||
|
@ -1498,6 +1501,9 @@ call fails (for example because the path doesn't exist).
|
||||||
.. versionchanged:: 3.13
|
.. versionchanged:: 3.13
|
||||||
The *follow_symlinks* parameter was added.
|
The *follow_symlinks* parameter was added.
|
||||||
|
|
||||||
|
.. versionchanged:: 3.13
|
||||||
|
The *pattern* parameter accepts a :term:`path-like object`.
|
||||||
|
|
||||||
.. method:: Path.rmdir()
|
.. method:: Path.rmdir()
|
||||||
|
|
||||||
Remove this directory. The directory must be empty.
|
Remove this directory. The directory must be empty.
|
||||||
|
|
|
@ -467,6 +467,29 @@ class PurePath(_abc.PurePathBase):
|
||||||
from urllib.parse import quote_from_bytes
|
from urllib.parse import quote_from_bytes
|
||||||
return prefix + quote_from_bytes(os.fsencode(path))
|
return prefix + quote_from_bytes(os.fsencode(path))
|
||||||
|
|
||||||
|
@property
|
||||||
|
def _pattern_stack(self):
|
||||||
|
"""Stack of path components, to be used with patterns in glob()."""
|
||||||
|
parts = self._tail.copy()
|
||||||
|
pattern = self._raw_path
|
||||||
|
if self.anchor:
|
||||||
|
raise NotImplementedError("Non-relative patterns are unsupported")
|
||||||
|
elif not parts:
|
||||||
|
raise ValueError("Unacceptable pattern: {!r}".format(pattern))
|
||||||
|
elif pattern[-1] in (self.pathmod.sep, self.pathmod.altsep):
|
||||||
|
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
|
||||||
|
parts.append('')
|
||||||
|
elif parts[-1] == '**':
|
||||||
|
# GH-70303: '**' only matches directories. Add trailing slash.
|
||||||
|
warnings.warn(
|
||||||
|
"Pattern ending '**' will match files and directories in a "
|
||||||
|
"future Python release. Add a trailing slash to match only "
|
||||||
|
"directories and remove this warning.",
|
||||||
|
FutureWarning, 4)
|
||||||
|
parts.append('')
|
||||||
|
parts.reverse()
|
||||||
|
return parts
|
||||||
|
|
||||||
|
|
||||||
# Subclassing os.PathLike makes isinstance() checks slower,
|
# Subclassing os.PathLike makes isinstance() checks slower,
|
||||||
# which in turn makes Path construction slower. Register instead!
|
# which in turn makes Path construction slower. Register instead!
|
||||||
|
@ -580,7 +603,7 @@ class Path(_abc.PathBase, PurePath):
|
||||||
def _scandir(self):
|
def _scandir(self):
|
||||||
return os.scandir(self)
|
return os.scandir(self)
|
||||||
|
|
||||||
def _make_child_entry(self, entry, is_dir=False):
|
def _make_child_entry(self, entry):
|
||||||
# Transform an entry yielded from _scandir() into a path object.
|
# Transform an entry yielded from _scandir() into a path object.
|
||||||
path_str = entry.name if str(self) == '.' else entry.path
|
path_str = entry.name if str(self) == '.' else entry.path
|
||||||
path = self.with_segments(path_str)
|
path = self.with_segments(path_str)
|
||||||
|
@ -591,6 +614,8 @@ class Path(_abc.PathBase, PurePath):
|
||||||
return path
|
return path
|
||||||
|
|
||||||
def _make_child_relpath(self, name):
|
def _make_child_relpath(self, name):
|
||||||
|
if not name:
|
||||||
|
return self
|
||||||
path_str = str(self)
|
path_str = str(self)
|
||||||
tail = self._tail
|
tail = self._tail
|
||||||
if tail:
|
if tail:
|
||||||
|
@ -611,14 +636,8 @@ class Path(_abc.PathBase, PurePath):
|
||||||
kind, including directories) matching the given relative pattern.
|
kind, including directories) matching the given relative pattern.
|
||||||
"""
|
"""
|
||||||
sys.audit("pathlib.Path.glob", self, pattern)
|
sys.audit("pathlib.Path.glob", self, pattern)
|
||||||
if pattern.endswith('**'):
|
if not isinstance(pattern, PurePath):
|
||||||
# GH-70303: '**' only matches directories. Add trailing slash.
|
pattern = self.with_segments(pattern)
|
||||||
warnings.warn(
|
|
||||||
"Pattern ending '**' will match files and directories in a "
|
|
||||||
"future Python release. Add a trailing slash to match only "
|
|
||||||
"directories and remove this warning.",
|
|
||||||
FutureWarning, 2)
|
|
||||||
pattern = f'{pattern}/'
|
|
||||||
return _abc.PathBase.glob(
|
return _abc.PathBase.glob(
|
||||||
self, pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
|
self, pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
|
||||||
|
|
||||||
|
@ -628,15 +647,9 @@ class Path(_abc.PathBase, PurePath):
|
||||||
this subtree.
|
this subtree.
|
||||||
"""
|
"""
|
||||||
sys.audit("pathlib.Path.rglob", self, pattern)
|
sys.audit("pathlib.Path.rglob", self, pattern)
|
||||||
if pattern.endswith('**'):
|
if not isinstance(pattern, PurePath):
|
||||||
# GH-70303: '**' only matches directories. Add trailing slash.
|
pattern = self.with_segments(pattern)
|
||||||
warnings.warn(
|
pattern = '**' / pattern
|
||||||
"Pattern ending '**' will match files and directories in a "
|
|
||||||
"future Python release. Add a trailing slash to match only "
|
|
||||||
"directories and remove this warning.",
|
|
||||||
FutureWarning, 2)
|
|
||||||
pattern = f'{pattern}/'
|
|
||||||
pattern = f'**/{pattern}'
|
|
||||||
return _abc.PathBase.glob(
|
return _abc.PathBase.glob(
|
||||||
self, pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
|
self, pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
|
||||||
|
|
||||||
|
|
|
@ -63,6 +63,12 @@ def _compile_pattern(pat, sep, case_sensitive):
|
||||||
return re.compile(regex, flags=flags).match
|
return re.compile(regex, flags=flags).match
|
||||||
|
|
||||||
|
|
||||||
|
def _select_special(paths, part):
|
||||||
|
"""Yield special literal children of the given paths."""
|
||||||
|
for path in paths:
|
||||||
|
yield path._make_child_relpath(part)
|
||||||
|
|
||||||
|
|
||||||
def _select_children(parent_paths, dir_only, follow_symlinks, match):
|
def _select_children(parent_paths, dir_only, follow_symlinks, match):
|
||||||
"""Yield direct children of given paths, filtering by name and type."""
|
"""Yield direct children of given paths, filtering by name and type."""
|
||||||
if follow_symlinks is None:
|
if follow_symlinks is None:
|
||||||
|
@ -84,7 +90,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
|
||||||
except OSError:
|
except OSError:
|
||||||
continue
|
continue
|
||||||
if match(entry.name):
|
if match(entry.name):
|
||||||
yield parent_path._make_child_entry(entry, dir_only)
|
yield parent_path._make_child_entry(entry)
|
||||||
|
|
||||||
|
|
||||||
def _select_recursive(parent_paths, dir_only, follow_symlinks):
|
def _select_recursive(parent_paths, dir_only, follow_symlinks):
|
||||||
|
@ -107,7 +113,7 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks):
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
try:
|
try:
|
||||||
if entry.is_dir(follow_symlinks=follow_symlinks):
|
if entry.is_dir(follow_symlinks=follow_symlinks):
|
||||||
paths.append(path._make_child_entry(entry, dir_only))
|
paths.append(path._make_child_entry(entry))
|
||||||
continue
|
continue
|
||||||
except OSError:
|
except OSError:
|
||||||
pass
|
pass
|
||||||
|
@ -427,6 +433,14 @@ class PurePathBase:
|
||||||
a drive)."""
|
a drive)."""
|
||||||
return self.pathmod.isabs(self._raw_path)
|
return self.pathmod.isabs(self._raw_path)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def _pattern_stack(self):
|
||||||
|
"""Stack of path components, to be used with patterns in glob()."""
|
||||||
|
anchor, parts = self._stack
|
||||||
|
if anchor:
|
||||||
|
raise NotImplementedError("Non-relative patterns are unsupported")
|
||||||
|
return parts
|
||||||
|
|
||||||
def match(self, path_pattern, *, case_sensitive=None):
|
def match(self, path_pattern, *, case_sensitive=None):
|
||||||
"""
|
"""
|
||||||
Return True if this path matches the given pattern.
|
Return True if this path matches the given pattern.
|
||||||
|
@ -436,11 +450,10 @@ class PurePathBase:
|
||||||
if case_sensitive is None:
|
if case_sensitive is None:
|
||||||
case_sensitive = _is_case_sensitive(self.pathmod)
|
case_sensitive = _is_case_sensitive(self.pathmod)
|
||||||
sep = path_pattern.pathmod.sep
|
sep = path_pattern.pathmod.sep
|
||||||
pattern_str = str(path_pattern)
|
|
||||||
if path_pattern.anchor:
|
if path_pattern.anchor:
|
||||||
pass
|
pattern_str = str(path_pattern)
|
||||||
elif path_pattern.parts:
|
elif path_pattern.parts:
|
||||||
pattern_str = f'**{sep}{pattern_str}'
|
pattern_str = str('**' / path_pattern)
|
||||||
else:
|
else:
|
||||||
raise ValueError("empty pattern")
|
raise ValueError("empty pattern")
|
||||||
match = _compile_pattern(pattern_str, sep, case_sensitive)
|
match = _compile_pattern(pattern_str, sep, case_sensitive)
|
||||||
|
@ -714,10 +727,8 @@ class PathBase(PurePathBase):
|
||||||
from contextlib import nullcontext
|
from contextlib import nullcontext
|
||||||
return nullcontext(self.iterdir())
|
return nullcontext(self.iterdir())
|
||||||
|
|
||||||
def _make_child_entry(self, entry, is_dir=False):
|
def _make_child_entry(self, entry):
|
||||||
# Transform an entry yielded from _scandir() into a path object.
|
# Transform an entry yielded from _scandir() into a path object.
|
||||||
if is_dir:
|
|
||||||
return entry.joinpath('')
|
|
||||||
return entry
|
return entry
|
||||||
|
|
||||||
def _make_child_relpath(self, name):
|
def _make_child_relpath(self, name):
|
||||||
|
@ -727,57 +738,35 @@ class PathBase(PurePathBase):
|
||||||
"""Iterate over this subtree and yield all existing files (of any
|
"""Iterate over this subtree and yield all existing files (of any
|
||||||
kind, including directories) matching the given relative pattern.
|
kind, including directories) matching the given relative pattern.
|
||||||
"""
|
"""
|
||||||
path_pattern = self.with_segments(pattern)
|
if not isinstance(pattern, PurePathBase):
|
||||||
if path_pattern.anchor:
|
pattern = self.with_segments(pattern)
|
||||||
raise NotImplementedError("Non-relative patterns are unsupported")
|
|
||||||
elif not path_pattern.parts:
|
|
||||||
raise ValueError("Unacceptable pattern: {!r}".format(pattern))
|
|
||||||
|
|
||||||
pattern_parts = list(path_pattern.parts)
|
|
||||||
if not self.pathmod.split(pattern)[1]:
|
|
||||||
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
|
|
||||||
pattern_parts.append('')
|
|
||||||
|
|
||||||
if case_sensitive is None:
|
if case_sensitive is None:
|
||||||
# TODO: evaluate case-sensitivity of each directory in _select_children().
|
# TODO: evaluate case-sensitivity of each directory in _select_children().
|
||||||
case_sensitive = _is_case_sensitive(self.pathmod)
|
case_sensitive = _is_case_sensitive(self.pathmod)
|
||||||
|
|
||||||
# If symlinks are handled consistently, and the pattern does not
|
stack = pattern._pattern_stack
|
||||||
# contain '..' components, then we can use a 'walk-and-match' strategy
|
specials = ('', '.', '..')
|
||||||
# when expanding '**' wildcards. When a '**' wildcard is encountered,
|
filter_paths = False
|
||||||
# all following pattern parts are immediately consumed and used to
|
|
||||||
# build a `re.Pattern` object. This pattern is used to filter the
|
|
||||||
# recursive walk. As a result, pattern parts following a '**' wildcard
|
|
||||||
# do not perform any filesystem access, which can be much faster!
|
|
||||||
filter_paths = follow_symlinks is not None and '..' not in pattern_parts
|
|
||||||
deduplicate_paths = False
|
deduplicate_paths = False
|
||||||
sep = self.pathmod.sep
|
sep = self.pathmod.sep
|
||||||
paths = iter([self.joinpath('')] if self.is_dir() else [])
|
paths = iter([self.joinpath('')] if self.is_dir() else [])
|
||||||
part_idx = 0
|
while stack:
|
||||||
while part_idx < len(pattern_parts):
|
part = stack.pop()
|
||||||
part = pattern_parts[part_idx]
|
if part in specials:
|
||||||
part_idx += 1
|
paths = _select_special(paths, part)
|
||||||
if part == '':
|
|
||||||
# Trailing slash.
|
|
||||||
pass
|
|
||||||
elif part == '..':
|
|
||||||
paths = (path._make_child_relpath('..') for path in paths)
|
|
||||||
elif part == '**':
|
elif part == '**':
|
||||||
# Consume adjacent '**' components.
|
# Consume adjacent '**' components.
|
||||||
while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**':
|
while stack and stack[-1] == '**':
|
||||||
part_idx += 1
|
stack.pop()
|
||||||
|
|
||||||
if filter_paths and part_idx < len(pattern_parts) and pattern_parts[part_idx] != '':
|
# Consume adjacent non-special components and enable post-walk
|
||||||
dir_only = pattern_parts[-1] == ''
|
# regex filtering, provided we're treating symlinks consistently.
|
||||||
paths = _select_recursive(paths, dir_only, follow_symlinks)
|
if follow_symlinks is not None:
|
||||||
|
while stack and stack[-1] not in specials:
|
||||||
|
filter_paths = True
|
||||||
|
stack.pop()
|
||||||
|
|
||||||
# Filter out paths that don't match pattern.
|
dir_only = bool(stack)
|
||||||
prefix_len = len(str(self._make_child_relpath('_'))) - 1
|
|
||||||
match = _compile_pattern(str(path_pattern), sep, case_sensitive)
|
|
||||||
paths = (path for path in paths if match(str(path), prefix_len))
|
|
||||||
return paths
|
|
||||||
|
|
||||||
dir_only = part_idx < len(pattern_parts)
|
|
||||||
paths = _select_recursive(paths, dir_only, follow_symlinks)
|
paths = _select_recursive(paths, dir_only, follow_symlinks)
|
||||||
if deduplicate_paths:
|
if deduplicate_paths:
|
||||||
# De-duplicate if we've already seen a '**' component.
|
# De-duplicate if we've already seen a '**' component.
|
||||||
|
@ -786,9 +775,14 @@ class PathBase(PurePathBase):
|
||||||
elif '**' in part:
|
elif '**' in part:
|
||||||
raise ValueError("Invalid pattern: '**' can only be an entire path component")
|
raise ValueError("Invalid pattern: '**' can only be an entire path component")
|
||||||
else:
|
else:
|
||||||
dir_only = part_idx < len(pattern_parts)
|
dir_only = bool(stack)
|
||||||
match = _compile_pattern(part, sep, case_sensitive)
|
match = _compile_pattern(part, sep, case_sensitive)
|
||||||
paths = _select_children(paths, dir_only, follow_symlinks, match)
|
paths = _select_children(paths, dir_only, follow_symlinks, match)
|
||||||
|
if filter_paths:
|
||||||
|
# Filter out paths that don't match pattern.
|
||||||
|
prefix_len = len(str(self._make_child_relpath('_'))) - 1
|
||||||
|
match = _compile_pattern(str(pattern), sep, case_sensitive)
|
||||||
|
paths = (path for path in paths if match(str(path), prefix_len))
|
||||||
return paths
|
return paths
|
||||||
|
|
||||||
def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
|
def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
|
||||||
|
@ -796,8 +790,10 @@ class PathBase(PurePathBase):
|
||||||
directories) matching the given relative pattern, anywhere in
|
directories) matching the given relative pattern, anywhere in
|
||||||
this subtree.
|
this subtree.
|
||||||
"""
|
"""
|
||||||
return self.glob(
|
if not isinstance(pattern, PurePathBase):
|
||||||
f'**/{pattern}', case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
|
pattern = self.with_segments(pattern)
|
||||||
|
pattern = '**' / pattern
|
||||||
|
return self.glob(pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
|
||||||
|
|
||||||
def walk(self, top_down=True, on_error=None, follow_symlinks=False):
|
def walk(self, top_down=True, on_error=None, follow_symlinks=False):
|
||||||
"""Walk the directory tree from this directory, similar to os.walk()."""
|
"""Walk the directory tree from this directory, similar to os.walk()."""
|
||||||
|
|
|
@ -1818,6 +1818,13 @@ class PathTest(test_pathlib_abc.DummyPathTest, PurePathTest):
|
||||||
list(base.walk())
|
list(base.walk())
|
||||||
list(base.walk(top_down=False))
|
list(base.walk(top_down=False))
|
||||||
|
|
||||||
|
def test_glob_empty_pattern(self):
|
||||||
|
p = self.cls('')
|
||||||
|
with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'):
|
||||||
|
list(p.glob(''))
|
||||||
|
with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'):
|
||||||
|
list(p.glob('.'))
|
||||||
|
|
||||||
def test_glob_many_open_files(self):
|
def test_glob_many_open_files(self):
|
||||||
depth = 30
|
depth = 30
|
||||||
P = self.cls
|
P = self.cls
|
||||||
|
@ -1860,6 +1867,22 @@ class PathTest(test_pathlib_abc.DummyPathTest, PurePathTest):
|
||||||
with self.assertWarns(FutureWarning):
|
with self.assertWarns(FutureWarning):
|
||||||
p.rglob('*/**')
|
p.rglob('*/**')
|
||||||
|
|
||||||
|
def test_glob_pathlike(self):
|
||||||
|
P = self.cls
|
||||||
|
p = P(self.base)
|
||||||
|
pattern = "dir*/file*"
|
||||||
|
expect = {p / "dirB/fileB", p / "dirC/fileC"}
|
||||||
|
self.assertEqual(expect, set(p.glob(P(pattern))))
|
||||||
|
self.assertEqual(expect, set(p.glob(FakePath(pattern))))
|
||||||
|
|
||||||
|
def test_rglob_pathlike(self):
|
||||||
|
P = self.cls
|
||||||
|
p = P(self.base, "dirC")
|
||||||
|
pattern = "**/file*"
|
||||||
|
expect = {p / "fileC", p / "dirD/fileD"}
|
||||||
|
self.assertEqual(expect, set(p.rglob(P(pattern))))
|
||||||
|
self.assertEqual(expect, set(p.rglob(FakePath(pattern))))
|
||||||
|
|
||||||
|
|
||||||
@only_posix
|
@only_posix
|
||||||
class PosixPathTest(PathTest, PurePosixPathTest):
|
class PosixPathTest(PathTest, PurePosixPathTest):
|
||||||
|
|
|
@ -1045,9 +1045,12 @@ class DummyPathTest(DummyPurePathTest):
|
||||||
_check(p.glob("*/"), ["dirA/", "dirB/", "dirC/", "dirE/", "linkB/"])
|
_check(p.glob("*/"), ["dirA/", "dirB/", "dirC/", "dirE/", "linkB/"])
|
||||||
|
|
||||||
def test_glob_empty_pattern(self):
|
def test_glob_empty_pattern(self):
|
||||||
p = self.cls('')
|
def _check(glob, expected):
|
||||||
with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'):
|
self.assertEqual(set(glob), { P(self.base, q) for q in expected })
|
||||||
list(p.glob(''))
|
P = self.cls
|
||||||
|
p = P(self.base)
|
||||||
|
_check(p.glob(""), [""])
|
||||||
|
_check(p.glob("."), ["."])
|
||||||
|
|
||||||
def test_glob_case_sensitive(self):
|
def test_glob_case_sensitive(self):
|
||||||
P = self.cls
|
P = self.cls
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
Accept :term:`path-like objects <path-like object>` as patterns in
|
||||||
|
:meth:`pathlib.Path.glob` and :meth:`~pathlib.Path.rglob`.
|
Loading…
Add table
Add a link
Reference in a new issue