GH-117586: Speed up pathlib.Path.glob() by working with strings (#117589)

Move pathlib globbing implementation into a new private class: `glob._Globber`. This class implements fast string-based globbing. It's called by `pathlib.Path.glob()`, which then converts strings back to path objects.

In the private pathlib ABCs, add a `pathlib._abc.Globber` subclass that works with `PathBase` objects rather than strings, and calls user-defined path methods like `PathBase.stat()` rather than `os.stat()`.

This sets the stage for two more improvements:

- GH-115060: Query non-wildcard segments with `lstat()`
- GH-116380: Unify `pathlib` and `glob` implementations of globbing.

No change to the implementations of `glob.glob()` and `glob.iglob()`.
This commit is contained in:
Barney Gale 2024-04-10 20:43:07 +01:00 committed by GitHub
parent 689ada7915
commit 6258844c27
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 269 additions and 195 deletions

View file

@ -4,7 +4,9 @@ import contextlib
import os import os
import re import re
import fnmatch import fnmatch
import functools
import itertools import itertools
import operator
import stat import stat
import sys import sys
@ -256,7 +258,9 @@ def escape(pathname):
return drive + pathname return drive + pathname
_special_parts = ('', '.', '..')
_dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0) _dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0)
_no_recurse_symlinks = object()
def translate(pat, *, recursive=False, include_hidden=False, seps=None): def translate(pat, *, recursive=False, include_hidden=False, seps=None):
@ -312,3 +316,185 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None):
results.append(any_sep) results.append(any_sep)
res = ''.join(results) res = ''.join(results)
return fr'(?s:{res})\Z' return fr'(?s:{res})\Z'
@functools.lru_cache(maxsize=512)
def _compile_pattern(pat, sep, case_sensitive, recursive=True):
    """Build a matcher for the glob pattern *pat*.

    The pattern is converted to regular-expression source by translate()
    (hidden files are always included, and *sep* is forwarded as the set of
    separators), compiled, and the compiled pattern's bound ``match`` method
    is returned. Matching ignores case unless *case_sensitive* is true.
    Results are memoised per argument combination by the lru_cache decorator.
    """
    if case_sensitive:
        re_flags = re.NOFLAG
    else:
        re_flags = re.IGNORECASE
    source = translate(pat, recursive=recursive, include_hidden=True, seps=sep)
    compiled = re.compile(source, flags=re_flags)
    return compiled.match
class _Globber:
    """Class providing shell-style pattern matching and globbing.

    An instance stores a path separator, a case-sensitivity flag and a
    recursion mode. selector() turns a stack of pattern parts into a chain
    of "select" callables; each callable takes ``(path, exists=False)`` and
    produces the matching paths.

    The low-level attributes (lstat, scandir, parse_entry, concat_path,
    add_slash) are the only places where paths are inspected or combined,
    so a subclass can replace them to glob over something other than
    ``os`` path strings.
    """

    def __init__(self, sep, case_sensitive, recursive=False):
        # sep: separator handed to the pattern compiler and used to join
        #      consecutive pattern parts in recursive_selector().
        # case_sensitive: forwarded to the pattern compiler.
        # recursive: truthy enables '**' handling in selector(). The
        #      module-level sentinel _no_recurse_symlinks is also truthy,
        #      but additionally tells recursive_selector() not to follow
        #      symlinks while walking.
        self.sep = sep
        self.case_sensitive = case_sensitive
        self.recursive = recursive

    # Low-level methods

    lstat = staticmethod(os.lstat)
    scandir = staticmethod(os.scandir)
    # Entries from scandir() are reduced to their 'path' attribute.
    parse_entry = operator.attrgetter('path')
    # Paths are strings here, so concatenation is plain '+'.
    concat_path = operator.add

    if os.name == 'nt':
        @staticmethod
        def add_slash(pathname):
            """Return *pathname* with a trailing backslash, unless the part
            after the drive/root is empty or already ends with a slash.
            """
            tail = os.path.splitroot(pathname)[2]
            if not tail or tail[-1] in '\\/':
                return pathname
            return f'{pathname}\\'
    else:
        @staticmethod
        def add_slash(pathname):
            """Return *pathname* with a trailing '/', unless it is empty or
            already ends with one.
            """
            if not pathname or pathname[-1] == '/':
                return pathname
            return f'{pathname}/'

    # High-level methods

    def compile(self, pat):
        """Return a match function for *pat*, compiled with this globber's
        separator, case sensitivity and recursion setting.
        """
        return _compile_pattern(pat, self.sep, self.case_sensitive, self.recursive)

    def selector(self, parts):
        """Returns a function that selects from a given path, walking and
        filtering according to the glob-style pattern parts in *parts*.

        *parts* is consumed as a stack: the next part to handle is popped
        from the end, and the list is mutated in the process.
        """
        if not parts:
            # No pattern left: the final step only confirms existence.
            return self.select_exists
        part = parts.pop()
        if self.recursive and part == '**':
            selector = self.recursive_selector
        elif part in _special_parts:
            selector = self.special_selector
        else:
            selector = self.wildcard_selector
        return selector(part, parts)

    def special_selector(self, part, parts):
        """Returns a function that selects special children of the given path.
        """
        select_next = self.selector(parts)

        def select_special(path, exists=False):
            # The special part is appended textually; no filesystem access
            # happens here, so *exists* is passed through unchanged.
            path = self.concat_path(self.add_slash(path), part)
            return select_next(path, exists)
        return select_special

    def wildcard_selector(self, part, parts):
        """Returns a function that selects direct children of a given path,
        filtering by pattern.
        """
        # A bare '*' matches every name, so no regex is needed for it.
        match = None if part == '*' else self.compile(part)
        # If more pattern parts follow, only directories can lead to a match.
        dir_only = bool(parts)
        if dir_only:
            select_next = self.selector(parts)

        def select_wildcard(path, exists=False):
            try:
                # We must close the scandir() object before proceeding to
                # avoid exhausting file descriptors when globbing deep trees.
                with self.scandir(path) as scandir_it:
                    entries = list(scandir_it)
            except OSError:
                # Unreadable or missing directory: it contributes no results.
                pass
            else:
                for entry in entries:
                    if match is None or match(entry.name):
                        if dir_only:
                            try:
                                if not entry.is_dir():
                                    continue
                            except OSError:
                                continue
                        entry_path = self.parse_entry(entry)
                        if dir_only:
                            # The entry came from scandir(), so it is
                            # already known to exist.
                            yield from select_next(entry_path, exists=True)
                        else:
                            yield entry_path
        return select_wildcard

    def recursive_selector(self, part, parts):
        """Returns a function that selects a given path and all its children,
        recursively, filtering by pattern.
        """
        # Optimization: consume following '**' parts, which have no effect.
        while parts and parts[-1] == '**':
            parts.pop()

        # Optimization: consume and join any following non-special parts here,
        # rather than leaving them for the next selector. They're used to
        # build a regular expression, which we use to filter the results of
        # the recursive walk. As a result, non-special pattern segments
        # following a '**' wildcard don't require additional filesystem access
        # to expand.
        follow_symlinks = self.recursive is not _no_recurse_symlinks
        if follow_symlinks:
            while parts and parts[-1] not in _special_parts:
                part += self.sep + parts.pop()

        # If nothing was joined onto '**', every walked path is accepted.
        match = None if part == '**' else self.compile(part)
        dir_only = bool(parts)
        select_next = self.selector(parts)

        def select_recursive(path, exists=False):
            path = self.add_slash(path)
            # Matching starts after the starting path, so the regex is only
            # applied to the portion discovered by the walk.
            match_pos = len(str(path))
            if match is None or match(str(path), match_pos):
                yield from select_next(path, exists)
            # Explicit stack instead of recursion; each step handles one
            # directory and pushes its subdirectories.
            stack = [path]
            while stack:
                yield from select_recursive_step(stack, match_pos)

        def select_recursive_step(stack, match_pos):
            path = stack.pop()
            try:
                # We must close the scandir() object before proceeding to
                # avoid exhausting file descriptors when globbing deep trees.
                with self.scandir(path) as scandir_it:
                    entries = list(scandir_it)
            except OSError:
                pass
            else:
                for entry in entries:
                    is_dir = False
                    try:
                        if entry.is_dir(follow_symlinks=follow_symlinks):
                            is_dir = True
                    except OSError:
                        # An entry whose type can't be determined is treated
                        # as a non-directory.
                        pass

                    if is_dir or not dir_only:
                        entry_path = self.parse_entry(entry)
                        if match is None or match(str(entry_path), match_pos):
                            if dir_only:
                                yield from select_next(entry_path, exists=True)
                            else:
                                # Optimization: directly yield the path if this is
                                # last pattern part.
                                yield entry_path
                        if is_dir:
                            # Queue the directory so the walk descends into it.
                            stack.append(entry_path)
        return select_recursive

    def select_exists(self, path, exists=False):
        """Yields the given path, if it exists.
        """
        if exists:
            # Optimization: this path is already known to exist, e.g. because
            # it was returned from os.scandir(), so we skip calling lstat().
            yield path
        else:
            try:
                self.lstat(path)
                yield path
            except OSError:
                pass

View file

@ -5,8 +5,10 @@ paths with operations that have semantics appropriate for different
operating systems. operating systems.
""" """
import glob
import io import io
import ntpath import ntpath
import operator
import os import os
import posixpath import posixpath
import sys import sys
@ -111,6 +113,7 @@ class PurePath(_abc.PurePathBase):
'_hash', '_hash',
) )
parser = os.path parser = os.path
_globber = glob._Globber
def __new__(cls, *args, **kwargs): def __new__(cls, *args, **kwargs):
"""Construct a PurePath from one or several strings and or existing """Construct a PurePath from one or several strings and or existing
@ -253,14 +256,17 @@ class PurePath(_abc.PurePathBase):
return cls.parser.sep.join(tail) return cls.parser.sep.join(tail)
def _from_parsed_parts(self, drv, root, tail): def _from_parsed_parts(self, drv, root, tail):
path_str = self._format_parsed_parts(drv, root, tail) path = self._from_parsed_string(self._format_parsed_parts(drv, root, tail))
path = self.with_segments(path_str)
path._str = path_str or '.'
path._drv = drv path._drv = drv
path._root = root path._root = root
path._tail_cached = tail path._tail_cached = tail
return path return path
def _from_parsed_string(self, path_str):
path = self.with_segments(path_str)
path._str = path_str or '.'
return path
@classmethod @classmethod
def _parse_path(cls, path): def _parse_path(cls, path):
if not path: if not path:
@ -453,21 +459,6 @@ class PurePath(_abc.PurePathBase):
from urllib.parse import quote_from_bytes from urllib.parse import quote_from_bytes
return prefix + quote_from_bytes(os.fsencode(path)) return prefix + quote_from_bytes(os.fsencode(path))
@property
def _pattern_stack(self):
"""Stack of path components, to be used with patterns in glob()."""
parts = self._tail.copy()
pattern = self._raw_path
if self.anchor:
raise NotImplementedError("Non-relative patterns are unsupported")
elif not parts:
raise ValueError("Unacceptable pattern: {!r}".format(pattern))
elif pattern[-1] in (self.parser.sep, self.parser.altsep):
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
parts.append('')
parts.reverse()
return parts
@property @property
def _pattern_str(self): def _pattern_str(self):
"""The path expressed as a string, for use in pattern-matching.""" """The path expressed as a string, for use in pattern-matching."""
@ -576,6 +567,17 @@ class Path(_abc.PathBase, PurePath):
encoding = io.text_encoding(encoding) encoding = io.text_encoding(encoding)
return _abc.PathBase.write_text(self, data, encoding, errors, newline) return _abc.PathBase.write_text(self, data, encoding, errors, newline)
_remove_leading_dot = operator.itemgetter(slice(2, None))
_remove_trailing_slash = operator.itemgetter(slice(-1))
def _filter_trailing_slash(self, paths):
sep = self.parser.sep
anchor_len = len(self.anchor)
for path_str in paths:
if len(path_str) > anchor_len and path_str[-1] == sep:
path_str = path_str[:-1]
yield path_str
def iterdir(self): def iterdir(self):
"""Yield path objects of the directory contents. """Yield path objects of the directory contents.
@ -587,13 +589,9 @@ class Path(_abc.PathBase, PurePath):
def _scandir(self): def _scandir(self):
return os.scandir(self) return os.scandir(self)
def _direntry_str(self, entry):
# Transform an entry yielded from _scandir() into a path string.
return entry.name if str(self) == '.' else entry.path
def _make_child_direntry(self, entry): def _make_child_direntry(self, entry):
# Transform an entry yielded from _scandir() into a path object. # Transform an entry yielded from _scandir() into a path object.
path_str = self._direntry_str(entry) path_str = entry.name if str(self) == '.' else entry.path
path = self.with_segments(path_str) path = self.with_segments(path_str)
path._str = path_str path._str = path_str
path._drv = self.drive path._drv = self.drive
@ -626,8 +624,30 @@ class Path(_abc.PathBase, PurePath):
sys.audit("pathlib.Path.glob", self, pattern) sys.audit("pathlib.Path.glob", self, pattern)
if not isinstance(pattern, PurePath): if not isinstance(pattern, PurePath):
pattern = self.with_segments(pattern) pattern = self.with_segments(pattern)
return _abc.PathBase.glob( if pattern.anchor:
self, pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks) raise NotImplementedError("Non-relative patterns are unsupported")
parts = pattern._tail.copy()
if not parts:
raise ValueError("Unacceptable pattern: {!r}".format(pattern))
raw = pattern._raw_path
if raw[-1] in (self.parser.sep, self.parser.altsep):
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
parts.append('')
if not self.is_dir():
return iter([])
select = self._glob_selector(parts[::-1], case_sensitive, recurse_symlinks)
root = str(self)
paths = select(root, exists=True)
# Normalize results
if root == '.':
paths = map(self._remove_leading_dot, paths)
if parts[-1] == '':
paths = map(self._remove_trailing_slash, paths)
elif parts[-1] == '**':
paths = self._filter_trailing_slash(paths)
paths = map(self._from_parsed_string, paths)
return paths
def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=False):
"""Recursively yield all existing files (of any kind, including """Recursively yield all existing files (of any kind, including
@ -638,8 +658,7 @@ class Path(_abc.PathBase, PurePath):
if not isinstance(pattern, PurePath): if not isinstance(pattern, PurePath):
pattern = self.with_segments(pattern) pattern = self.with_segments(pattern)
pattern = '**' / pattern pattern = '**' / pattern
return _abc.PathBase.glob( return self.glob(pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks)
self, pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks)
def walk(self, top_down=True, on_error=None, follow_symlinks=False): def walk(self, top_down=True, on_error=None, follow_symlinks=False):
"""Walk the directory tree from this directory, similar to os.walk().""" """Walk the directory tree from this directory, similar to os.walk()."""
@ -669,9 +688,7 @@ class Path(_abc.PathBase, PurePath):
# of joining, and we exploit the fact that getcwd() returns a # of joining, and we exploit the fact that getcwd() returns a
# fully-normalized string by storing it in _str. This is used to # fully-normalized string by storing it in _str. This is used to
# implement Path.cwd(). # implement Path.cwd().
result = self.with_segments(cwd) return self._from_parsed_string(cwd)
result._str = cwd
return result
drive, root, rel = os.path.splitroot(cwd) drive, root, rel = os.path.splitroot(cwd)
if not rel: if not rel:
return self._from_parsed_parts(drive, root, self._tail) return self._from_parsed_parts(drive, root, self._tail)

View file

@ -12,6 +12,8 @@ resemble pathlib's PurePath and Path respectively.
""" """
import functools import functools
import glob
import operator
from errno import ENOENT, ENOTDIR, EBADF, ELOOP, EINVAL from errno import ENOENT, ENOTDIR, EBADF, ELOOP, EINVAL
from stat import S_ISDIR, S_ISLNK, S_ISREG, S_ISSOCK, S_ISBLK, S_ISCHR, S_ISFIFO from stat import S_ISDIR, S_ISLNK, S_ISREG, S_ISSOCK, S_ISBLK, S_ISCHR, S_ISFIFO
@ -40,109 +42,23 @@ def _ignore_error(exception):
def _is_case_sensitive(parser): def _is_case_sensitive(parser):
return parser.normcase('Aa') == 'Aa' return parser.normcase('Aa') == 'Aa'
#
# Globbing helpers
#
re = glob = None class Globber(glob._Globber):
lstat = operator.methodcaller('lstat')
scandir = operator.methodcaller('_scandir')
add_slash = operator.methodcaller('joinpath', '')
@staticmethod
def concat_path(path, text):
"""Appends text to the given path.
"""
return path.with_segments(path._raw_path + text)
@functools.lru_cache(maxsize=512) @staticmethod
def _compile_pattern(pat, sep, case_sensitive, recursive=True): def parse_entry(entry):
"""Compile given glob pattern to a re.Pattern object (observing case """Returns the path of an entry yielded from scandir().
sensitivity).""" """
global re, glob return entry
if re is None:
import re, glob
flags = re.NOFLAG if case_sensitive else re.IGNORECASE
regex = glob.translate(pat, recursive=recursive, include_hidden=True, seps=sep)
return re.compile(regex, flags=flags).match
def _select_special(paths, part):
"""Yield special literal children of the given paths."""
for path in paths:
yield path._make_child_relpath(part)
def _select_children(parent_paths, dir_only, match):
"""Yield direct children of given paths, filtering by name and type."""
for parent_path in parent_paths:
try:
# We must close the scandir() object before proceeding to
# avoid exhausting file descriptors when globbing deep trees.
with parent_path._scandir() as scandir_it:
entries = list(scandir_it)
except OSError:
pass
else:
for entry in entries:
if dir_only:
try:
if not entry.is_dir():
continue
except OSError:
continue
# Avoid cost of making a path object for non-matching paths by
# matching against the os.DirEntry.name string.
if match is None or match(entry.name):
yield parent_path._make_child_direntry(entry)
def _select_recursive(parent_paths, dir_only, follow_symlinks, match):
"""Yield given paths and all their children, recursively, filtering by
string and type.
"""
for parent_path in parent_paths:
if match is not None:
# If we're filtering paths through a regex, record the length of
# the parent path. We'll pass it to match(path, pos=...) later.
parent_len = len(str(parent_path._make_child_relpath('_'))) - 1
paths = [parent_path._make_child_relpath('')]
while paths:
path = paths.pop()
if match is None or match(str(path), parent_len):
# Yield *directory* path that matches pattern (if any).
yield path
try:
# We must close the scandir() object before proceeding to
# avoid exhausting file descriptors when globbing deep trees.
with path._scandir() as scandir_it:
entries = list(scandir_it)
except OSError:
pass
else:
for entry in entries:
# Handle directory entry.
try:
if entry.is_dir(follow_symlinks=follow_symlinks):
# Recurse into this directory.
paths.append(path._make_child_direntry(entry))
continue
except OSError:
pass
# Handle file entry.
if not dir_only:
# Avoid cost of making a path object for non-matching
# files by matching against the os.DirEntry object.
if match is None or match(path._direntry_str(entry), parent_len):
# Yield *file* path that matches pattern (if any).
yield path._make_child_direntry(entry)
def _select_unique(paths):
"""Yields the given paths, filtering out duplicates."""
yielded = set()
try:
for path in paths:
path_str = str(path)
if path_str not in yielded:
yield path
yielded.add(path_str)
finally:
yielded.clear()
class UnsupportedOperation(NotImplementedError): class UnsupportedOperation(NotImplementedError):
@ -218,6 +134,7 @@ class PurePathBase:
'_resolving', '_resolving',
) )
parser = ParserBase() parser = ParserBase()
_globber = Globber
def __init__(self, path, *paths): def __init__(self, path, *paths):
self._raw_path = self.parser.join(path, *paths) if paths else path self._raw_path = self.parser.join(path, *paths) if paths else path
@ -454,14 +371,6 @@ class PurePathBase:
a drive).""" a drive)."""
return self.parser.isabs(self._raw_path) return self.parser.isabs(self._raw_path)
@property
def _pattern_stack(self):
"""Stack of path components, to be used with patterns in glob()."""
anchor, parts = self._stack
if anchor:
raise NotImplementedError("Non-relative patterns are unsupported")
return parts
@property @property
def _pattern_str(self): def _pattern_str(self):
"""The path expressed as a string, for use in pattern-matching.""" """The path expressed as a string, for use in pattern-matching."""
@ -487,8 +396,9 @@ class PurePathBase:
return False return False
if len(path_parts) > len(pattern_parts) and path_pattern.anchor: if len(path_parts) > len(pattern_parts) and path_pattern.anchor:
return False return False
globber = self._globber(sep, case_sensitive)
for path_part, pattern_part in zip(path_parts, pattern_parts): for path_part, pattern_part in zip(path_parts, pattern_parts):
match = _compile_pattern(pattern_part, sep, case_sensitive, recursive=False) match = globber.compile(pattern_part)
if match(path_part) is None: if match(path_part) is None:
return False return False
return True return True
@ -502,7 +412,8 @@ class PurePathBase:
pattern = self.with_segments(pattern) pattern = self.with_segments(pattern)
if case_sensitive is None: if case_sensitive is None:
case_sensitive = _is_case_sensitive(self.parser) case_sensitive = _is_case_sensitive(self.parser)
match = _compile_pattern(pattern._pattern_str, pattern.parser.sep, case_sensitive) globber = self._globber(pattern.parser.sep, case_sensitive, recursive=True)
match = globber.compile(pattern._pattern_str)
return match(self._pattern_str) is not None return match(self._pattern_str) is not None
@ -772,11 +683,6 @@ class PathBase(PurePathBase):
from contextlib import nullcontext from contextlib import nullcontext
return nullcontext(self.iterdir()) return nullcontext(self.iterdir())
def _direntry_str(self, entry):
# Transform an entry yielded from _scandir() into a path string.
# PathBase._scandir() yields PathBase objects, so use str().
return str(entry)
def _make_child_direntry(self, entry): def _make_child_direntry(self, entry):
# Transform an entry yielded from _scandir() into a path object. # Transform an entry yielded from _scandir() into a path object.
# PathBase._scandir() yields PathBase objects, so this is a no-op. # PathBase._scandir() yields PathBase objects, so this is a no-op.
@ -785,62 +691,26 @@ class PathBase(PurePathBase):
def _make_child_relpath(self, name): def _make_child_relpath(self, name):
return self.joinpath(name) return self.joinpath(name)
def _glob_selector(self, parts, case_sensitive, recurse_symlinks):
if case_sensitive is None:
case_sensitive = _is_case_sensitive(self.parser)
recursive = True if recurse_symlinks else glob._no_recurse_symlinks
globber = self._globber(self.parser.sep, case_sensitive, recursive)
return globber.selector(parts)
def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=True): def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=True):
"""Iterate over this subtree and yield all existing files (of any """Iterate over this subtree and yield all existing files (of any
kind, including directories) matching the given relative pattern. kind, including directories) matching the given relative pattern.
""" """
if not isinstance(pattern, PurePathBase): if not isinstance(pattern, PurePathBase):
pattern = self.with_segments(pattern) pattern = self.with_segments(pattern)
if case_sensitive is None: anchor, parts = pattern._stack
# TODO: evaluate case-sensitivity of each directory in _select_children(). if anchor:
case_sensitive = _is_case_sensitive(self.parser) raise NotImplementedError("Non-relative patterns are unsupported")
if not self.is_dir():
stack = pattern._pattern_stack return iter([])
specials = ('', '.', '..') select = self._glob_selector(parts, case_sensitive, recurse_symlinks)
deduplicate_paths = False return select(self, exists=True)
sep = self.parser.sep
paths = iter([self] if self.is_dir() else [])
while stack:
part = stack.pop()
if part in specials:
# Join special component (e.g. '..') onto paths.
paths = _select_special(paths, part)
elif part == '**':
# Consume following '**' components, which have no effect.
while stack and stack[-1] == '**':
stack.pop()
# Consume following non-special components, provided we're
# treating symlinks consistently. Each component is joined
# onto 'part', which is used to generate an re.Pattern object.
if recurse_symlinks:
while stack and stack[-1] not in specials:
part += sep + stack.pop()
# If the previous loop consumed pattern components, compile an
# re.Pattern object based on those components.
match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None
# Recursively walk directories, filtering by type and regex.
paths = _select_recursive(paths, bool(stack), recurse_symlinks, match)
# De-duplicate if we've already seen a '**' component.
if deduplicate_paths:
paths = _select_unique(paths)
deduplicate_paths = True
elif '**' in part:
raise ValueError("Invalid pattern: '**' can only be an entire path component")
else:
# If the pattern component isn't '*', compile an re.Pattern
# object based on the component.
match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None
# Iterate over directories' children filtering by type and regex.
paths = _select_children(paths, bool(stack), match)
return paths
def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=True): def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=True):
"""Recursively yield all existing files (of any kind, including """Recursively yield all existing files (of any kind, including

View file

@ -0,0 +1 @@
Speed up :meth:`pathlib.Path.glob` by working with strings internally.