GH-117586: Speed up pathlib.Path.glob() by working with strings (#117589)

Move pathlib globbing implementation into a new private class: `glob._Globber`. This class implements fast string-based globbing. It's called by `pathlib.Path.glob()`, which then converts strings back to path objects.

In the private pathlib ABCs, add a `pathlib._abc.Globber` subclass that works with `PathBase` objects rather than strings, and calls user-defined path methods like `PathBase.stat()` rather than `os.stat()`.

This sets the stage for two more improvements:

- GH-115060: Query non-wildcard segments with `lstat()`
- GH-116380: Unify `pathlib` and `glob` implementations of globbing.

No change to the implementations of `glob.glob()` and `glob.iglob()`.
2025-07-12 22:05:16 +00:00 · 2024-04-10 20:43:07 +01:00 · 2024-04-10 20:43:07 +01:00 · 6258844c27
commit 6258844c27
parent 689ada7915
4 changed files with 269 additions and 195 deletions
--- a/Lib/glob.py
+++ b/Lib/glob.py
@ -4,7 +4,9 @@ import contextlib
 import os
 import re
 import fnmatch
 import functools
 import itertools
 import operator
 import stat
 import sys
@ -256,7 +258,9 @@ def escape(pathname):
    return drive + pathname
 # Pattern parts that are appended to a path literally instead of being
 # matched against directory entries (see _Globber.selector()).
 _special_parts = ('', '.', '..')
 # os.O_RDONLY combined with os.O_DIRECTORY where the platform defines it
 # (0 otherwise). NOTE(review): presumably meant for os.open() on
 # directories; it is not referenced by the code visible here.
 _dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0)
 # Unique sentinel accepted as a globber's *recursive* setting. It is truthy,
 # so '**' is still expanded, but _Globber.recursive_selector() checks for it
 # by identity and then does not follow symlinks while walking.
 _no_recurse_symlinks = object()
 def translate(pat, *, recursive=False, include_hidden=False, seps=None):
@ -312,3 +316,185 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None):
                results.append(any_sep)
    res = ''.join(results)
    return fr'(?s:{res})\Z'
@functools.lru_cache(maxsize=512)
def _compile_pattern(pat, sep, case_sensitive, recursive=True):
    """Return the bound ``match`` method of a regex compiled from *pat*.

    The glob pattern is converted with translate() (hidden files included,
    *sep* used as the separator set) and compiled with re.IGNORECASE unless
    *case_sensitive* is true. Results are memoized per argument tuple.
    """
    if case_sensitive:
        regex_flags = re.NOFLAG
    else:
        regex_flags = re.IGNORECASE
    expression = translate(
        pat, recursive=recursive, include_hidden=True, seps=sep)
    compiled = re.compile(expression, flags=regex_flags)
    return compiled.match
 class _Globber:
    """Class providing shell-style pattern matching and globbing.

    A globber is configured with a path separator, a case-sensitivity flag
    and a *recursive* setting. selector() turns a list of pattern parts into
    a chain of generator functions; calling the returned function with a
    starting path yields the matching paths.

    The "low-level" attributes (lstat, scandir, parse_entry, concat_path and
    add_slash) isolate how paths are stat'ed, listed and joined. In this
    class they operate on plain strings (os.lstat, os.scandir,
    DirEntry.path and string concatenation).
    """
    def __init__(self,  sep, case_sensitive, recursive=False):
        """Store the globbing configuration.

        *sep* is the separator handed to the pattern compiler and used to
        join consecutive pattern parts after a '**' part.
        *case_sensitive* selects whether compiled patterns match
        case-sensitively.
        *recursive* enables the special treatment of '**' parts when truthy.
        If it is the module-level _no_recurse_symlinks sentinel, the
        recursive walk additionally does not follow symlinks.
        """
        self.sep = sep
        self.case_sensitive = case_sensitive
        self.recursive = recursive
    # Low-level methods
    # Checks that a path exists; an OSError is treated as "does not exist"
    # by select_exists().
    lstat = staticmethod(os.lstat)
    # Returns a context manager yielding the entries of a directory.
    scandir = staticmethod(os.scandir)
    # Converts an entry yielded by scandir() into a path value (here: the
    # entry's 'path' attribute).
    parse_entry = operator.attrgetter('path')
    # Appends text to a path value (here: plain string concatenation).
    concat_path = operator.add
    if os.name == 'nt':
        @staticmethod
        def add_slash(pathname):
            """Return *pathname* with a trailing backslash appended.

            The path is returned unchanged when the part after the
            drive/root (as split by os.path.splitroot()) is empty, or when
            it already ends with a slash or backslash.
            """
            tail = os.path.splitroot(pathname)[2]
            if not tail or tail[-1] in '\\/':
                return pathname
            return f'{pathname}\\'
    else:
        @staticmethod
        def add_slash(pathname):
            """Return *pathname* with a trailing slash appended.

            An empty path, or one already ending with '/', is returned
            unchanged.
            """
            if not pathname or pathname[-1] == '/':
                return pathname
            return f'{pathname}/'
    # High-level methods
    def compile(self, pat):
        """Return a match function for the glob pattern *pat*, compiled with
        this globber's separator, case sensitivity and recursive setting.
        """
        return _compile_pattern(pat, self.sep, self.case_sensitive, self.recursive)
    def selector(self, parts):
        """Returns a function that selects from a given path, walking and
        filtering according to the glob-style pattern parts in *parts*.

        *parts* is used as a stack: parts are popped from the end, so the
        list is mutated and the first pattern segment must be stored last.
        The returned function takes (path, exists=False) and returns an
        iterator of matching paths.
        """
        if not parts:
            # Nothing left to match: only existence remains to be checked.
            return self.select_exists
        part = parts.pop()
        if self.recursive and part == '**':
            selector = self.recursive_selector
        elif part in _special_parts:
            selector = self.special_selector
        else:
            selector = self.wildcard_selector
        return selector(part, parts)
    def special_selector(self, part, parts):
        """Returns a function that selects special children of the given path.

        *part* is appended literally (after a separator) to the incoming
        path without consulting the filesystem; the result is handed to the
        selector built from the remaining *parts*.
        """
        select_next = self.selector(parts)
        def select_special(path, exists=False):
            path = self.concat_path(self.add_slash(path), part)
            # The incoming 'exists' flag is forwarded unchanged.
            return select_next(path, exists)
        return select_special
    def wildcard_selector(self, part, parts):
        """Returns a function that selects direct children of a given path,
        filtering by pattern.

        Entry names are tested against *part*; a bare '*' skips the regex
        entirely. When further *parts* remain, only directories are kept and
        each one is passed on to the next selector.
        """
        match = None if part == '*' else self.compile(part)
        dir_only = bool(parts)
        if dir_only:
            # select_next is bound only when more parts follow;
            # select_wildcard() reads it under the same condition.
            select_next = self.selector(parts)
        def select_wildcard(path, exists=False):
            try:
                # We must close the scandir() object before proceeding to
                # avoid exhausting file descriptors when globbing deep trees.
                with self.scandir(path) as scandir_it:
                    entries = list(scandir_it)
            except OSError:
                # A path that cannot be listed simply yields nothing.
                pass
            else:
                for entry in entries:
                    if match is None or match(entry.name):
                        if dir_only:
                            try:
                                if not entry.is_dir():
                                    continue
                            except OSError:
                                # Entries whose type cannot be determined
                                # are skipped.
                                continue
                        entry_path = self.parse_entry(entry)
                        if dir_only:
                            # The entry came from scandir(), so it is known
                            # to exist.
                            yield from select_next(entry_path, exists=True)
                        else:
                            yield entry_path
        return select_wildcard
    def recursive_selector(self, part, parts):
        """Returns a function that selects a given path and all its children,
        recursively, filtering by pattern.

        *part* is the '**' part that was just popped; *parts* is the
        remaining stack, from which further parts may be consumed here.
        """
        # Optimization: consume following '**' parts, which have no effect.
        while parts and parts[-1] == '**':
            parts.pop()
        # Optimization: consume and join any following non-special parts here,
        # rather than leaving them for the next selector. They're used to
        # build a regular expression, which we use to filter the results of
        # the recursive walk. As a result, non-special pattern segments
        # following a '**' wildcard don't require additional filesystem access
        # to expand.
        follow_symlinks = self.recursive is not _no_recurse_symlinks
        # The joining optimization is applied only when symlinks are followed.
        if follow_symlinks:
            while parts and parts[-1] not in _special_parts:
                part += self.sep + parts.pop()
        # If nothing was joined onto '**', every walked path is accepted.
        match = None if part == '**' else self.compile(part)
        dir_only = bool(parts)
        select_next = self.selector(parts)
        def select_recursive(path, exists=False):
            path = self.add_slash(path)
            # The regex is applied from this offset onwards, i.e. only to the
            # portion of each path that follows the starting directory.
            match_pos = len(str(path))
            # The starting path itself is a candidate before any walking.
            if match is None or match(str(path), match_pos):
                yield from select_next(path, exists)
            # Explicit LIFO stack of directories still to be listed.
            stack = [path]
            while stack:
                yield from select_recursive_step(stack, match_pos)
        def select_recursive_step(stack, match_pos):
            # Lists one directory popped from *stack*, yields its matching
            # entries and pushes its subdirectories back onto *stack*.
            path = stack.pop()
            try:
                # We must close the scandir() object before proceeding to
                # avoid exhausting file descriptors when globbing deep trees.
                with self.scandir(path) as scandir_it:
                    entries = list(scandir_it)
            except OSError:
                pass
            else:
                for entry in entries:
                    is_dir = False
                    try:
                        if entry.is_dir(follow_symlinks=follow_symlinks):
                            is_dir = True
                    except OSError:
                        # An entry whose type cannot be determined is
                        # treated as a non-directory.
                        pass
                    if is_dir or not dir_only:
                        entry_path = self.parse_entry(entry)
                        if match is None or match(str(entry_path), match_pos):
                            if dir_only:
                                yield from select_next(entry_path, exists=True)
                            else:
                                # Optimization: directly yield the path if this is
                                # last pattern part.
                                yield entry_path
                        if is_dir:
                            # Directories are walked even when they do not
                            # match the pattern themselves.
                            stack.append(entry_path)
        return select_recursive
    def select_exists(self, path, exists=False):
        """Yields the given path, if it exists.

        When *exists* is true the path is yielded without any check;
        otherwise lstat() is called and an OSError means nothing is yielded.
        """
        if exists:
            # Optimization: this path is already known to exist, e.g. because
            # it was returned from os.scandir(), so we skip calling lstat().
            yield path
        else:
            try:
                self.lstat(path)
                yield path
            except OSError:
                pass
--- a/Lib/pathlib/init.py
+++ b/Lib/pathlib/init.py
@ -5,8 +5,10 @@ paths with operations that have semantics appropriate for different
 operating systems.
 """
 import glob
 import io
 import ntpath
 import operator
 import os
 import posixpath
 import sys
@ -111,6 +113,7 @@ class PurePath(_abc.PurePathBase):
        '_hash',
    )
    parser = os.path
    _globber = glob._Globber
    def __new__(cls, *args, **kwargs):
        """Construct a PurePath from one or several strings and or existing
@ -253,14 +256,17 @@ class PurePath(_abc.PurePathBase):
        return cls.parser.sep.join(tail)
    def _from_parsed_parts(self, drv, root, tail):
-        path_str = self._format_parsed_parts(drv, root, tail)
+        path = self._from_parsed_string(self._format_parsed_parts(drv, root, tail))
        path = self.with_segments(path_str)
        path._str = path_str or '.'
        path._drv = drv
        path._root = root
        path._tail_cached = tail
        return path
    def _from_parsed_string(self, path_str):
        path = self.with_segments(path_str)
        path._str = path_str or '.'
        return path
    @classmethod
    def _parse_path(cls, path):
        if not path:
@ -453,21 +459,6 @@ class PurePath(_abc.PurePathBase):
        from urllib.parse import quote_from_bytes
        return prefix + quote_from_bytes(os.fsencode(path))
    @property
    def _pattern_stack(self):
        """Stack of path components, to be used with patterns in glob()."""
        parts = self._tail.copy()
        pattern = self._raw_path
        if self.anchor:
            raise NotImplementedError("Non-relative patterns are unsupported")
        elif not parts:
            raise ValueError("Unacceptable pattern: {!r}".format(pattern))
        elif pattern[-1] in (self.parser.sep, self.parser.altsep):
            # GH-65238: pathlib doesn't preserve trailing slash. Add it back.
            parts.append('')
        parts.reverse()
        return parts
    @property
    def _pattern_str(self):
        """The path expressed as a string, for use in pattern-matching."""
@ -576,6 +567,17 @@ class Path(_abc.PathBase, PurePath):
        encoding = io.text_encoding(encoding)
        return _abc.PathBase.write_text(self, data, encoding, errors, newline)
    # Returns its argument with the first two characters sliced off. glob()
    # maps it over result strings when the root is '.' (presumably to drop
    # the leading './' that scanning '.' puts on each result).
    _remove_leading_dot = operator.itemgetter(slice(2, None))
    # Returns its argument with the last character sliced off. glob() maps it
    # over result strings when the final pattern part is empty.
    _remove_trailing_slash = operator.itemgetter(slice(-1))
    def _filter_trailing_slash(self, paths):
        sep = self.parser.sep
        anchor_len = len(self.anchor)
        for path_str in paths:
            if len(path_str) > anchor_len and path_str[-1] == sep:
                path_str = path_str[:-1]
            yield path_str
    def iterdir(self):
        """Yield path objects of the directory contents.
@ -587,13 +589,9 @@ class Path(_abc.PathBase, PurePath):
    def _scandir(self):
        """Return the os.scandir() iterator for this path."""
        directory_iterator = os.scandir(self)
        return directory_iterator
    def _direntry_str(self, entry):
        # Transform an entry yielded from _scandir() into a path string.
        return entry.name if str(self) == '.' else entry.path
    def _make_child_direntry(self, entry):
        # Transform an entry yielded from _scandir() into a path object.
-        path_str = self._direntry_str(entry)
+        path_str = entry.name if str(self) == '.' else entry.path
        path = self.with_segments(path_str)
        path._str = path_str
        path._drv = self.drive
@ -626,8 +624,30 @@ class Path(_abc.PathBase, PurePath):
        sys.audit("pathlib.Path.glob", self, pattern)
        if not isinstance(pattern, PurePath):
            pattern = self.with_segments(pattern)
-        return _abc.PathBase.glob(
+        if pattern.anchor:
-            self, pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks)
+            raise NotImplementedError("Non-relative patterns are unsupported")
        parts = pattern._tail.copy()
        if not parts:
            raise ValueError("Unacceptable pattern: {!r}".format(pattern))
        raw = pattern._raw_path
        if raw[-1] in (self.parser.sep, self.parser.altsep):
            # GH-65238: pathlib doesn't preserve trailing slash. Add it back.
            parts.append('')
        if not self.is_dir():
            return iter([])
        select = self._glob_selector(parts[::-1], case_sensitive, recurse_symlinks)
        root = str(self)
        paths = select(root, exists=True)
        # Normalize results
        if root == '.':
            paths = map(self._remove_leading_dot, paths)
        if parts[-1] == '':
            paths = map(self._remove_trailing_slash, paths)
        elif parts[-1] == '**':
            paths = self._filter_trailing_slash(paths)
        paths = map(self._from_parsed_string, paths)
        return paths
    def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=False):
        """Recursively yield all existing files (of any kind, including
@ -638,8 +658,7 @@ class Path(_abc.PathBase, PurePath):
        if not isinstance(pattern, PurePath):
            pattern = self.with_segments(pattern)
        pattern = '**' / pattern
-        return _abc.PathBase.glob(
+        return self.glob(pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks)
            self, pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks)
    def walk(self, top_down=True, on_error=None, follow_symlinks=False):
        """Walk the directory tree from this directory, similar to os.walk()."""
@ -669,9 +688,7 @@ class Path(_abc.PathBase, PurePath):
            # of joining, and we exploit the fact that getcwd() returns a
            # fully-normalized string by storing it in _str. This is used to
            # implement Path.cwd().
-            result = self.with_segments(cwd)
+            return self._from_parsed_string(cwd)
            result._str = cwd
            return result
        drive, root, rel = os.path.splitroot(cwd)
        if not rel:
            return self._from_parsed_parts(drive, root, self._tail)
--- a/Lib/pathlib/_abc.py
+++ b/Lib/pathlib/_abc.py
@ -12,6 +12,8 @@ resemble pathlib's PurePath and Path respectively.
 """
 import functools
 import glob
 import operator
 from errno import ENOENT, ENOTDIR, EBADF, ELOOP, EINVAL
 from stat import S_ISDIR, S_ISLNK, S_ISREG, S_ISSOCK, S_ISBLK, S_ISCHR, S_ISFIFO
@ -40,109 +42,23 @@ def _ignore_error(exception):
 def _is_case_sensitive(parser):
    return parser.normcase('Aa') == 'Aa'
 #
 # Globbing helpers
 #
-re = glob = None
+class Globber(glob._Globber):
    lstat = operator.methodcaller('lstat')
    scandir = operator.methodcaller('_scandir')
    add_slash = operator.methodcaller('joinpath', '')
    @staticmethod
    def concat_path(path, text):
        """Appends text to the given path.
        """
        return path.with_segments(path._raw_path + text)
-@functools.lru_cache(maxsize=512)
+    @staticmethod
-def _compile_pattern(pat, sep, case_sensitive, recursive=True):
+    def parse_entry(entry):
-    """Compile given glob pattern to a re.Pattern object (observing case
+        """Returns the path of an entry yielded from scandir().
-    sensitivity)."""
+        """
-    global re, glob
+        return entry
    if re is None:
        import re, glob
    flags = re.NOFLAG if case_sensitive else re.IGNORECASE
    regex = glob.translate(pat, recursive=recursive, include_hidden=True, seps=sep)
    return re.compile(regex, flags=flags).match
 def _select_special(paths, part):
    """Yield special literal children of the given paths."""
    for path in paths:
        yield path._make_child_relpath(part)
 def _select_children(parent_paths, dir_only, match):
    """Yield direct children of given paths, filtering by name and type."""
    for parent_path in parent_paths:
        try:
            # We must close the scandir() object before proceeding to
            # avoid exhausting file descriptors when globbing deep trees.
            with parent_path._scandir() as scandir_it:
                entries = list(scandir_it)
        except OSError:
            pass
        else:
            for entry in entries:
                if dir_only:
                    try:
                        if not entry.is_dir():
                            continue
                    except OSError:
                        continue
                # Avoid cost of making a path object for non-matching paths by
                # matching against the os.DirEntry.name string.
                if match is None or match(entry.name):
                    yield parent_path._make_child_direntry(entry)
 def _select_recursive(parent_paths, dir_only, follow_symlinks, match):
    """Yield given paths and all their children, recursively, filtering by
    string and type.

    Each parent is walked with an explicit stack of directories. Every
    directory popped from the stack (starting with the parent joined with an
    empty segment) is yielded if it passes *match*; non-directory entries
    are yielded only when *dir_only* is false. *follow_symlinks* is passed
    to each entry's is_dir() check. *match*, when not None, is called with
    the path string and the offset at which matching should start.
    """
    for parent_path in parent_paths:
        if match is not None:
            # If we're filtering paths through a regex, record the length of
            # the parent path. We'll pass it to match(path, pos=...) later.
            # parent_len is bound only in this case; every later use is
            # guarded by a 'match is None or ...' short-circuit.
            parent_len = len(str(parent_path._make_child_relpath('_'))) - 1
        paths = [parent_path._make_child_relpath('')]
        while paths:
            path = paths.pop()
            if match is None or match(str(path), parent_len):
                # Yield *directory* path that matches pattern (if any).
                yield path
            try:
                # We must close the scandir() object before proceeding to
                # avoid exhausting file descriptors when globbing deep trees.
                with path._scandir() as scandir_it:
                    entries = list(scandir_it)
            except OSError:
                # Directories that cannot be listed contribute no children.
                pass
            else:
                for entry in entries:
                    # Handle directory entry.
                    try:
                        if entry.is_dir(follow_symlinks=follow_symlinks):
                            # Recurse into this directory.
                            # It is yielded (if it matches) when it is later
                            # popped from the stack, not here.
                            paths.append(path._make_child_direntry(entry))
                            continue
                    except OSError:
                        # If the type check fails, fall through and treat
                        # the entry as a file.
                        pass
                    # Handle file entry.
                    if not dir_only:
                        # Avoid cost of making a path object for non-matching
                        # files by matching against the os.DirEntry object.
                        if match is None or match(path._direntry_str(entry), parent_len):
                            # Yield *file* path that matches pattern (if any).
                            yield path._make_child_direntry(entry)
 def _select_unique(paths):
    """Yields the given paths, filtering out duplicates."""
    yielded = set()
    try:
        for path in paths:
            path_str = str(path)
            if path_str not in yielded:
                yield path
                yielded.add(path_str)
    finally:
        yielded.clear()
 class UnsupportedOperation(NotImplementedError):
@ -218,6 +134,7 @@ class PurePathBase:
        '_resolving',
    )
    parser = ParserBase()
    _globber = Globber
    def __init__(self, path, *paths):
        self._raw_path = self.parser.join(path, *paths) if paths else path
@ -454,14 +371,6 @@ class PurePathBase:
        a drive)."""
        return self.parser.isabs(self._raw_path)
    @property
    def _pattern_stack(self):
        """Stack of path components, to be used with patterns in glob()."""
        anchor, parts = self._stack
        if anchor:
            raise NotImplementedError("Non-relative patterns are unsupported")
        return parts
    @property
    def _pattern_str(self):
        """The path expressed as a string, for use in pattern-matching."""
@ -487,8 +396,9 @@ class PurePathBase:
            return False
        if len(path_parts) > len(pattern_parts) and path_pattern.anchor:
            return False
        globber = self._globber(sep, case_sensitive)
        for path_part, pattern_part in zip(path_parts, pattern_parts):
-            match = _compile_pattern(pattern_part, sep, case_sensitive, recursive=False)
+            match = globber.compile(pattern_part)
            if match(path_part) is None:
                return False
        return True
@ -502,7 +412,8 @@ class PurePathBase:
            pattern = self.with_segments(pattern)
        if case_sensitive is None:
            case_sensitive = _is_case_sensitive(self.parser)
-        match = _compile_pattern(pattern._pattern_str, pattern.parser.sep, case_sensitive)
+        globber = self._globber(pattern.parser.sep, case_sensitive, recursive=True)
        match = globber.compile(pattern._pattern_str)
        return match(self._pattern_str) is not None
@ -772,11 +683,6 @@ class PathBase(PurePathBase):
        from contextlib import nullcontext
        return nullcontext(self.iterdir())
    def _direntry_str(self, entry):
        # Transform an entry yielded from _scandir() into a path string.
        # PathBase._scandir() yields PathBase objects, so use str().
        return str(entry)
    def _make_child_direntry(self, entry):
        # Transform an entry yielded from _scandir() into a path object.
        # PathBase._scandir() yields PathBase objects, so this is a no-op.
@ -785,62 +691,26 @@ class PathBase(PurePathBase):
    def _make_child_relpath(self, name):
        return self.joinpath(name)
    def _glob_selector(self, parts, case_sensitive, recurse_symlinks):
        """Build the selector function for the pattern parts in *parts*.

        A *case_sensitive* of None is resolved from this path's parser. The
        globber is created in recursive mode, using the
        glob._no_recurse_symlinks sentinel when *recurse_symlinks* is false.
        """
        if case_sensitive is None:
            case_sensitive = _is_case_sensitive(self.parser)
        if recurse_symlinks:
            recursive_mode = True
        else:
            recursive_mode = glob._no_recurse_symlinks
        return self._globber(
            self.parser.sep, case_sensitive, recursive_mode).selector(parts)
    def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=True):
        """Iterate over this subtree and yield all existing files (of any
        kind, including directories) matching the given relative pattern.
        """
        if not isinstance(pattern, PurePathBase):
            pattern = self.with_segments(pattern)
-        if case_sensitive is None:
+        anchor, parts = pattern._stack
-            # TODO: evaluate case-sensitivity of each directory in _select_children().
+        if anchor:
-            case_sensitive = _is_case_sensitive(self.parser)
+            raise NotImplementedError("Non-relative patterns are unsupported")
-
+        if not self.is_dir():
-        stack = pattern._pattern_stack
+            return iter([])
-        specials = ('', '.', '..')
+        select = self._glob_selector(parts, case_sensitive, recurse_symlinks)
-        deduplicate_paths = False
+        return select(self, exists=True)
        sep = self.parser.sep
        paths = iter([self] if self.is_dir() else [])
        while stack:
            part = stack.pop()
            if part in specials:
                # Join special component (e.g. '..') onto paths.
                paths = _select_special(paths, part)
            elif part == '**':
                # Consume following '**' components, which have no effect.
                while stack and stack[-1] == '**':
                    stack.pop()
                # Consume following non-special components, provided we're
                # treating symlinks consistently. Each component is joined
                # onto 'part', which is used to generate an re.Pattern object.
                if recurse_symlinks:
                    while stack and stack[-1] not in specials:
                        part += sep + stack.pop()
                # If the previous loop consumed pattern components, compile an
                # re.Pattern object based on those components.
                match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None
                # Recursively walk directories, filtering by type and regex.
                paths = _select_recursive(paths, bool(stack), recurse_symlinks, match)
                # De-duplicate if we've already seen a '**' component.
                if deduplicate_paths:
                    paths = _select_unique(paths)
                deduplicate_paths = True
            elif '**' in part:
                raise ValueError("Invalid pattern: '**' can only be an entire path component")
            else:
                # If the pattern component isn't '*', compile an re.Pattern
                # object based on the component.
                match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None
                # Iterate over directories' children filtering by type and regex.
                paths = _select_children(paths, bool(stack), match)
        return paths
    def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=True):
        """Recursively yield all existing files (of any kind, including
--- a/Misc/NEWS.d/next/Library/2024-04-06-20-31-09.gh-issue-117586.UgWdRK.rst
+++ b/Misc/NEWS.d/next/Library/2024-04-06-20-31-09.gh-issue-117586.UgWdRK.rst
@ -0,0 +1 @@
 Speed up :meth:`pathlib.Path.glob` by working with strings internally.
		`@ -0,0 +1 @@`
							Speed up :meth:`pathlib.Path.glob` by working with strings internally.