GH-102613: Improve performance of pathlib.Path.rglob() (GH-104244)

Stop de-duplicating results in `_RecursiveWildcardSelector`. A new
`_DoubleRecursiveWildcardSelector` class is introduced which performs
de-duplication, but this is used _only_ for patterns with multiple
non-adjacent `**` segments, such as `path.glob('**/foo/**')`. By avoiding
the use of a set, `PurePath.__hash__()` is not called, and so paths do not
need to be stringified and case-normalised.

Also merge adjacent `**` segments in patterns.
This commit is contained in:
Barney Gale 2023-05-07 22:12:50 +01:00 committed by GitHub
parent 8d95012c95
commit c0ece3dc97
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 45 additions and 18 deletions

View file

@ -1853,13 +1853,14 @@ class _BasePathTest(object):
def test_rglob_common(self):
def _check(glob, expected):
self.assertEqual(set(glob), { P(BASE, q) for q in expected })
self.assertEqual(sorted(glob), sorted(P(BASE, q) for q in expected))
P = self.cls
p = P(BASE)
it = p.rglob("fileA")
self.assertIsInstance(it, collections.abc.Iterator)
_check(it, ["fileA"])
_check(p.rglob("fileB"), ["dirB/fileB"])
_check(p.rglob("**/fileB"), ["dirB/fileB"])
_check(p.rglob("*/fileA"), [])
if not os_helper.can_symlink():
_check(p.rglob("*/fileB"), ["dirB/fileB"])
@ -1883,9 +1884,12 @@ class _BasePathTest(object):
_check(p.rglob("*"), ["dirC/fileC", "dirC/novel.txt",
"dirC/dirD", "dirC/dirD/fileD"])
_check(p.rglob("file*"), ["dirC/fileC", "dirC/dirD/fileD"])
_check(p.rglob("**/file*"), ["dirC/fileC", "dirC/dirD/fileD"])
_check(p.rglob("dir*/**"), ["dirC/dirD"])
_check(p.rglob("*/*"), ["dirC/dirD/fileD"])
_check(p.rglob("*/"), ["dirC/dirD"])
_check(p.rglob(""), ["dirC", "dirC/dirD"])
_check(p.rglob("**"), ["dirC", "dirC/dirD"])
# gh-91616, a re module regression
_check(p.rglob("*.txt"), ["dirC/novel.txt"])
_check(p.rglob("*.*"), ["dirC/novel.txt"])