GH-102613: Fast recursive globbing in pathlib.Path.glob() (GH-104512)

This commit introduces a 'walk-and-match' strategy for handling glob patterns that include a non-terminal `**` wildcard, such as `**/*.py`. For this example, the previous implementation recursively walked directories using `os.scandir()` when it expanded the `**` component, and then **scanned those same directories again** when expanded the `*.py` component. This is wasteful.

In the new implementation, any components following a `**` wildcard are used to build a `re.Pattern` object, which is used to filter the results of the recursive walk. A pattern like `**/*.py` uses half the number of `os.scandir()` calls; a pattern like `**/*/*.py` a third, etc.

This new algorithm does not apply if either:

1. The *follow_symlinks* argument is set to `None` (its default), or
2. The pattern contains `..` components.

In these cases we fall back to the old implementation.

This commit also replaces selector classes with selector functions. These generators directly yield results rather calling through to their successors. A new internal `Path._glob()` method takes care to chain these generators together, which simplifies the lazy algorithm and slightly improves performance. It should also be easier to understand and maintain.
This commit is contained in:
Barney Gale 2023-06-06 23:50:36 +01:00 committed by GitHub
parent 2587b9f64e
commit 24af45172f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 161 additions and 138 deletions

View file

@ -1898,6 +1898,16 @@ class _BasePathTest(object):
_check(p, "*B/*", ["dirB/fileB", "dirB/linkD", "linkB/fileB", "linkB/linkD"])
_check(p, "*/fileB", ["dirB/fileB", "linkB/fileB"])
_check(p, "*/", ["dirA", "dirB", "dirC", "dirE", "linkB"])
_check(p, "dir*/*/..", ["dirC/dirD/..", "dirA/linkC/.."])
_check(p, "dir*/**/", ["dirA", "dirA/linkC", "dirA/linkC/linkD", "dirB", "dirB/linkD",
"dirC", "dirC/dirD", "dirE"])
_check(p, "dir*/**/..", ["dirA/..", "dirA/linkC/..", "dirB/..",
"dirC/..", "dirC/dirD/..", "dirE/.."])
_check(p, "dir*/*/**/", ["dirA/linkC", "dirA/linkC/linkD", "dirB/linkD", "dirC/dirD"])
_check(p, "dir*/*/**/..", ["dirA/linkC/..", "dirC/dirD/.."])
_check(p, "dir*/**/fileC", ["dirC/fileC"])
_check(p, "dir*/*/../dirD/**/", ["dirC/dirD/../dirD"])
_check(p, "*/dirD/**/", ["dirC/dirD"])
@os_helper.skip_unless_symlink
def test_glob_no_follow_symlinks_common(self):
@ -1912,6 +1922,14 @@ class _BasePathTest(object):
_check(p, "*B/*", ["dirB/fileB", "dirB/linkD"])
_check(p, "*/fileB", ["dirB/fileB"])
_check(p, "*/", ["dirA", "dirB", "dirC", "dirE"])
_check(p, "dir*/*/..", ["dirC/dirD/.."])
_check(p, "dir*/**/", ["dirA", "dirB", "dirC", "dirC/dirD", "dirE"])
_check(p, "dir*/**/..", ["dirA/..", "dirB/..", "dirC/..", "dirC/dirD/..", "dirE/.."])
_check(p, "dir*/*/**/", ["dirC/dirD"])
_check(p, "dir*/*/**/..", ["dirC/dirD/.."])
_check(p, "dir*/**/fileC", ["dirC/fileC"])
_check(p, "dir*/*/../dirD/**/", ["dirC/dirD/../dirD"])
_check(p, "*/dirD/**/", ["dirC/dirD"])
def test_rglob_common(self):
def _check(glob, expected):