mirror of
https://github.com/python/cpython.git
synced 2025-07-19 09:15:34 +00:00
GH-102613: Improve performance of pathlib.Path.rglob()
(GH-104244)
Stop de-duplicating results in `_RecursiveWildcardSelector`. A new `_DoubleRecursiveWildcardSelector` class is introduced which performs de-duplication, but this is used _only_ for patterns with multiple non-adjacent `**` segments, such as `path.glob('**/foo/**')`. By avoiding the use of a set, `PurePath.__hash__()` is not called, and so paths do not need to be stringified and case-normalised. Also merge adjacent '**' segments in patterns.
This commit is contained in:
parent
8d95012c95
commit
c0ece3dc97
3 changed files with 45 additions and 18 deletions
|
@ -64,17 +64,25 @@ def _is_case_sensitive(flavour):
|
|||
@functools.lru_cache()
|
||||
def _make_selector(pattern_parts, flavour, case_sensitive):
|
||||
pat = pattern_parts[0]
|
||||
child_parts = pattern_parts[1:]
|
||||
if not pat:
|
||||
return _TerminatingSelector()
|
||||
if pat == '**':
|
||||
cls = _RecursiveWildcardSelector
|
||||
elif pat == '..':
|
||||
cls = _ParentSelector
|
||||
elif '**' in pat:
|
||||
raise ValueError("Invalid pattern: '**' can only be an entire path component")
|
||||
child_parts_idx = 1
|
||||
while child_parts_idx < len(pattern_parts) and pattern_parts[child_parts_idx] == '**':
|
||||
child_parts_idx += 1
|
||||
child_parts = pattern_parts[child_parts_idx:]
|
||||
if '**' in child_parts:
|
||||
cls = _DoubleRecursiveWildcardSelector
|
||||
else:
|
||||
cls = _RecursiveWildcardSelector
|
||||
else:
|
||||
cls = _WildcardSelector
|
||||
child_parts = pattern_parts[1:]
|
||||
if pat == '..':
|
||||
cls = _ParentSelector
|
||||
elif '**' in pat:
|
||||
raise ValueError("Invalid pattern: '**' can only be an entire path component")
|
||||
else:
|
||||
cls = _WildcardSelector
|
||||
return cls(pat, child_parts, flavour, case_sensitive)
|
||||
|
||||
|
||||
|
@ -183,20 +191,32 @@ class _RecursiveWildcardSelector(_Selector):
|
|||
|
||||
def _select_from(self, parent_path, scandir):
|
||||
try:
|
||||
yielded = set()
|
||||
try:
|
||||
successor_select = self.successor._select_from
|
||||
for starting_point in self._iterate_directories(parent_path, scandir):
|
||||
for p in successor_select(starting_point, scandir):
|
||||
if p not in yielded:
|
||||
yield p
|
||||
yielded.add(p)
|
||||
finally:
|
||||
yielded.clear()
|
||||
successor_select = self.successor._select_from
|
||||
for starting_point in self._iterate_directories(parent_path, scandir):
|
||||
for p in successor_select(starting_point, scandir):
|
||||
yield p
|
||||
except PermissionError:
|
||||
return
|
||||
|
||||
|
||||
class _DoubleRecursiveWildcardSelector(_RecursiveWildcardSelector):
|
||||
"""
|
||||
Like _RecursiveWildcardSelector, but also de-duplicates results from
|
||||
successive selectors. This is necessary if the pattern contains
|
||||
multiple non-adjacent '**' segments.
|
||||
"""
|
||||
|
||||
def _select_from(self, parent_path, scandir):
|
||||
yielded = set()
|
||||
try:
|
||||
for p in super()._select_from(parent_path, scandir):
|
||||
if p not in yielded:
|
||||
yield p
|
||||
yielded.add(p)
|
||||
finally:
|
||||
yielded.clear()
|
||||
|
||||
|
||||
#
|
||||
# Public API
|
||||
#
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue