GH-102613: Improve performance of pathlib.Path.rglob() (GH-104244)

Stop de-duplicating results in `_RecursiveWildcardSelector`. A new
`_DoubleRecursiveWildcardSelector` class is introduced which performs
de-duplication, but this is used _only_ for patterns with multiple
non-adjacent `**` segments, such as `path.glob('**/foo/**')`. By avoiding
the use of a set, `PurePath.__hash__()` is not called, and so paths do not
need to be stringified and case-normalised.

Also merge adjacent '**' segments in patterns.
This commit is contained in:
Barney Gale 2023-05-07 22:12:50 +01:00 committed by GitHub
parent 8d95012c95
commit c0ece3dc97
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 45 additions and 18 deletions

View file

@ -64,17 +64,25 @@ def _is_case_sensitive(flavour):
@functools.lru_cache() @functools.lru_cache()
def _make_selector(pattern_parts, flavour, case_sensitive): def _make_selector(pattern_parts, flavour, case_sensitive):
pat = pattern_parts[0] pat = pattern_parts[0]
child_parts = pattern_parts[1:]
if not pat: if not pat:
return _TerminatingSelector() return _TerminatingSelector()
if pat == '**': if pat == '**':
cls = _RecursiveWildcardSelector child_parts_idx = 1
elif pat == '..': while child_parts_idx < len(pattern_parts) and pattern_parts[child_parts_idx] == '**':
cls = _ParentSelector child_parts_idx += 1
elif '**' in pat: child_parts = pattern_parts[child_parts_idx:]
raise ValueError("Invalid pattern: '**' can only be an entire path component") if '**' in child_parts:
cls = _DoubleRecursiveWildcardSelector
else:
cls = _RecursiveWildcardSelector
else: else:
cls = _WildcardSelector child_parts = pattern_parts[1:]
if pat == '..':
cls = _ParentSelector
elif '**' in pat:
raise ValueError("Invalid pattern: '**' can only be an entire path component")
else:
cls = _WildcardSelector
return cls(pat, child_parts, flavour, case_sensitive) return cls(pat, child_parts, flavour, case_sensitive)
@ -183,20 +191,32 @@ class _RecursiveWildcardSelector(_Selector):
def _select_from(self, parent_path, scandir): def _select_from(self, parent_path, scandir):
try: try:
yielded = set() successor_select = self.successor._select_from
try: for starting_point in self._iterate_directories(parent_path, scandir):
successor_select = self.successor._select_from for p in successor_select(starting_point, scandir):
for starting_point in self._iterate_directories(parent_path, scandir): yield p
for p in successor_select(starting_point, scandir):
if p not in yielded:
yield p
yielded.add(p)
finally:
yielded.clear()
except PermissionError: except PermissionError:
return return
class _DoubleRecursiveWildcardSelector(_RecursiveWildcardSelector):
    """
    Recursive-wildcard selector that never yields the same path twice.

    It wraps the results produced by _RecursiveWildcardSelector and drops
    any path that has already been yielded. This filtering is required
    when a pattern has several '**' segments that are not adjacent to one
    another.
    """

    def _select_from(self, parent_path, scandir):
        # Paths already handed to the caller during this traversal.
        seen = set()
        try:
            for candidate in super()._select_from(parent_path, scandir):
                if candidate in seen:
                    continue
                yield candidate
                # Record the path only after the caller has resumed us.
                seen.add(candidate)
        finally:
            # Runs when the generator is exhausted, closed, or raises, so
            # the recorded paths are released promptly.
            seen.clear()
# #
# Public API # Public API
# #

View file

@ -1853,13 +1853,14 @@ class _BasePathTest(object):
def test_rglob_common(self): def test_rglob_common(self):
def _check(glob, expected): def _check(glob, expected):
self.assertEqual(set(glob), { P(BASE, q) for q in expected }) self.assertEqual(sorted(glob), sorted(P(BASE, q) for q in expected))
P = self.cls P = self.cls
p = P(BASE) p = P(BASE)
it = p.rglob("fileA") it = p.rglob("fileA")
self.assertIsInstance(it, collections.abc.Iterator) self.assertIsInstance(it, collections.abc.Iterator)
_check(it, ["fileA"]) _check(it, ["fileA"])
_check(p.rglob("fileB"), ["dirB/fileB"]) _check(p.rglob("fileB"), ["dirB/fileB"])
_check(p.rglob("**/fileB"), ["dirB/fileB"])
_check(p.rglob("*/fileA"), []) _check(p.rglob("*/fileA"), [])
if not os_helper.can_symlink(): if not os_helper.can_symlink():
_check(p.rglob("*/fileB"), ["dirB/fileB"]) _check(p.rglob("*/fileB"), ["dirB/fileB"])
@ -1883,9 +1884,12 @@ class _BasePathTest(object):
_check(p.rglob("*"), ["dirC/fileC", "dirC/novel.txt", _check(p.rglob("*"), ["dirC/fileC", "dirC/novel.txt",
"dirC/dirD", "dirC/dirD/fileD"]) "dirC/dirD", "dirC/dirD/fileD"])
_check(p.rglob("file*"), ["dirC/fileC", "dirC/dirD/fileD"]) _check(p.rglob("file*"), ["dirC/fileC", "dirC/dirD/fileD"])
_check(p.rglob("**/file*"), ["dirC/fileC", "dirC/dirD/fileD"])
_check(p.rglob("dir*/**"), ["dirC/dirD"])
_check(p.rglob("*/*"), ["dirC/dirD/fileD"]) _check(p.rglob("*/*"), ["dirC/dirD/fileD"])
_check(p.rglob("*/"), ["dirC/dirD"]) _check(p.rglob("*/"), ["dirC/dirD"])
_check(p.rglob(""), ["dirC", "dirC/dirD"]) _check(p.rglob(""), ["dirC", "dirC/dirD"])
_check(p.rglob("**"), ["dirC", "dirC/dirD"])
# gh-91616, a re module regression # gh-91616, a re module regression
_check(p.rglob("*.txt"), ["dirC/novel.txt"]) _check(p.rglob("*.txt"), ["dirC/novel.txt"])
_check(p.rglob("*.*"), ["dirC/novel.txt"]) _check(p.rglob("*.*"), ["dirC/novel.txt"])

View file

@ -0,0 +1,3 @@
Improve performance of :meth:`pathlib.Path.glob` and
:meth:`~pathlib.Path.rglob` when expanding recursive wildcards ("``**``") by
merging adjacent wildcards and de-duplicating results only when necessary.