mirror of
https://github.com/python/cpython.git
synced 2025-07-24 11:44:31 +00:00
GH-102613: Improve performance of pathlib.Path.rglob() (GH-104244)
Stop de-duplicating results in `_RecursiveWildcardSelector`. A new `_DoubleRecursiveWildcardSelector` class is introduced which performs de-duplication, but it is used _only_ for patterns with multiple non-adjacent `**` segments, such as `path.glob('**/foo/**')`. By avoiding the use of a set, `PurePath.__hash__()` is not called, and so paths do not need to be stringified and case-normalised. Also merge adjacent `**` segments in patterns.
This commit is contained in:
parent
8d95012c95
commit
c0ece3dc97
3 changed files with 45 additions and 18 deletions
|
@ -64,17 +64,25 @@ def _is_case_sensitive(flavour):
|
||||||
@functools.lru_cache()
|
@functools.lru_cache()
|
||||||
def _make_selector(pattern_parts, flavour, case_sensitive):
|
def _make_selector(pattern_parts, flavour, case_sensitive):
|
||||||
pat = pattern_parts[0]
|
pat = pattern_parts[0]
|
||||||
child_parts = pattern_parts[1:]
|
|
||||||
if not pat:
|
if not pat:
|
||||||
return _TerminatingSelector()
|
return _TerminatingSelector()
|
||||||
if pat == '**':
|
if pat == '**':
|
||||||
cls = _RecursiveWildcardSelector
|
child_parts_idx = 1
|
||||||
elif pat == '..':
|
while child_parts_idx < len(pattern_parts) and pattern_parts[child_parts_idx] == '**':
|
||||||
cls = _ParentSelector
|
child_parts_idx += 1
|
||||||
elif '**' in pat:
|
child_parts = pattern_parts[child_parts_idx:]
|
||||||
raise ValueError("Invalid pattern: '**' can only be an entire path component")
|
if '**' in child_parts:
|
||||||
|
cls = _DoubleRecursiveWildcardSelector
|
||||||
|
else:
|
||||||
|
cls = _RecursiveWildcardSelector
|
||||||
else:
|
else:
|
||||||
cls = _WildcardSelector
|
child_parts = pattern_parts[1:]
|
||||||
|
if pat == '..':
|
||||||
|
cls = _ParentSelector
|
||||||
|
elif '**' in pat:
|
||||||
|
raise ValueError("Invalid pattern: '**' can only be an entire path component")
|
||||||
|
else:
|
||||||
|
cls = _WildcardSelector
|
||||||
return cls(pat, child_parts, flavour, case_sensitive)
|
return cls(pat, child_parts, flavour, case_sensitive)
|
||||||
|
|
||||||
|
|
||||||
|
@ -183,20 +191,32 @@ class _RecursiveWildcardSelector(_Selector):
|
||||||
|
|
||||||
def _select_from(self, parent_path, scandir):
|
def _select_from(self, parent_path, scandir):
|
||||||
try:
|
try:
|
||||||
yielded = set()
|
successor_select = self.successor._select_from
|
||||||
try:
|
for starting_point in self._iterate_directories(parent_path, scandir):
|
||||||
successor_select = self.successor._select_from
|
for p in successor_select(starting_point, scandir):
|
||||||
for starting_point in self._iterate_directories(parent_path, scandir):
|
yield p
|
||||||
for p in successor_select(starting_point, scandir):
|
|
||||||
if p not in yielded:
|
|
||||||
yield p
|
|
||||||
yielded.add(p)
|
|
||||||
finally:
|
|
||||||
yielded.clear()
|
|
||||||
except PermissionError:
|
except PermissionError:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
class _DoubleRecursiveWildcardSelector(_RecursiveWildcardSelector):
|
||||||
|
"""
|
||||||
|
Like _RecursiveWildcardSelector, but also de-duplicates results from
|
||||||
|
successive selectors. This is necessary if the pattern contains
|
||||||
|
multiple non-adjacent '**' segments.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def _select_from(self, parent_path, scandir):
|
||||||
|
yielded = set()
|
||||||
|
try:
|
||||||
|
for p in super()._select_from(parent_path, scandir):
|
||||||
|
if p not in yielded:
|
||||||
|
yield p
|
||||||
|
yielded.add(p)
|
||||||
|
finally:
|
||||||
|
yielded.clear()
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Public API
|
# Public API
|
||||||
#
|
#
|
||||||
|
|
|
@ -1853,13 +1853,14 @@ class _BasePathTest(object):
|
||||||
|
|
||||||
def test_rglob_common(self):
|
def test_rglob_common(self):
|
||||||
def _check(glob, expected):
|
def _check(glob, expected):
|
||||||
self.assertEqual(set(glob), { P(BASE, q) for q in expected })
|
self.assertEqual(sorted(glob), sorted(P(BASE, q) for q in expected))
|
||||||
P = self.cls
|
P = self.cls
|
||||||
p = P(BASE)
|
p = P(BASE)
|
||||||
it = p.rglob("fileA")
|
it = p.rglob("fileA")
|
||||||
self.assertIsInstance(it, collections.abc.Iterator)
|
self.assertIsInstance(it, collections.abc.Iterator)
|
||||||
_check(it, ["fileA"])
|
_check(it, ["fileA"])
|
||||||
_check(p.rglob("fileB"), ["dirB/fileB"])
|
_check(p.rglob("fileB"), ["dirB/fileB"])
|
||||||
|
_check(p.rglob("**/fileB"), ["dirB/fileB"])
|
||||||
_check(p.rglob("*/fileA"), [])
|
_check(p.rglob("*/fileA"), [])
|
||||||
if not os_helper.can_symlink():
|
if not os_helper.can_symlink():
|
||||||
_check(p.rglob("*/fileB"), ["dirB/fileB"])
|
_check(p.rglob("*/fileB"), ["dirB/fileB"])
|
||||||
|
@ -1883,9 +1884,12 @@ class _BasePathTest(object):
|
||||||
_check(p.rglob("*"), ["dirC/fileC", "dirC/novel.txt",
|
_check(p.rglob("*"), ["dirC/fileC", "dirC/novel.txt",
|
||||||
"dirC/dirD", "dirC/dirD/fileD"])
|
"dirC/dirD", "dirC/dirD/fileD"])
|
||||||
_check(p.rglob("file*"), ["dirC/fileC", "dirC/dirD/fileD"])
|
_check(p.rglob("file*"), ["dirC/fileC", "dirC/dirD/fileD"])
|
||||||
|
_check(p.rglob("**/file*"), ["dirC/fileC", "dirC/dirD/fileD"])
|
||||||
|
_check(p.rglob("dir*/**"), ["dirC/dirD"])
|
||||||
_check(p.rglob("*/*"), ["dirC/dirD/fileD"])
|
_check(p.rglob("*/*"), ["dirC/dirD/fileD"])
|
||||||
_check(p.rglob("*/"), ["dirC/dirD"])
|
_check(p.rglob("*/"), ["dirC/dirD"])
|
||||||
_check(p.rglob(""), ["dirC", "dirC/dirD"])
|
_check(p.rglob(""), ["dirC", "dirC/dirD"])
|
||||||
|
_check(p.rglob("**"), ["dirC", "dirC/dirD"])
|
||||||
# gh-91616, a re module regression
|
# gh-91616, a re module regression
|
||||||
_check(p.rglob("*.txt"), ["dirC/novel.txt"])
|
_check(p.rglob("*.txt"), ["dirC/novel.txt"])
|
||||||
_check(p.rglob("*.*"), ["dirC/novel.txt"])
|
_check(p.rglob("*.*"), ["dirC/novel.txt"])
|
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
Improve performance of :meth:`pathlib.Path.glob` when expanding recursive
|
||||||
|
wildcards ("``**``") by merging adjacent wildcards and de-duplicating
|
||||||
|
results only when necessary.
|
Loading…
Add table
Add a link
Reference in a new issue