GH-72904: Add glob.translate() function (#106703)

Add `glob.translate()` function that converts a pathname with shell wildcards to a regular expression. The regular expression is used by pathlib to implement `match()` and `glob()`. This function differs from `fnmatch.translate()` in that wildcards do not match path separators by default, and that a `*` pattern segment matches precisely one path segment. When *recursive* is set to true, `**` pattern segments match any number of path segments, and `**` cannot appear outside its own segment. In pathlib, this change speeds up directory walking (because `_make_child_relpath()` does less work), makes path objects smaller (they don't need a `_lines` slot), and removes the need for some gnarly code. Co-authored-by: Jason R. Coombs <jaraco@jaraco.com> Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com>
2025-10-10 00:43:41 +00:00 · 2023-11-13 17:15:56 +00:00 · 2023-11-13 17:15:56 +00:00 · cf67ebfb31
commit cf67ebfb31
parent babb787047
7 changed files with 229 additions and 106 deletions
--- a/Lib/glob.py
+++ b/Lib/glob.py
@@ -249,3 +249,63 @@ def escape(pathname):


 _dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0)
+
+
+def translate(pat, *, recursive=False, include_hidden=False, seps=None):
+    """Translate a pathname with shell wildcards to a regular expression.
+
+    If `recursive` is true, the pattern segment '**' will match any number of
+    path segments; if '**' appears outside its own segment, ValueError will be
+    raised.
+
+    If `include_hidden` is true, wildcards can match path segments beginning
+    with a dot ('.').
+
+    If a sequence of separator characters is given to `seps`, they will be
+    used to split the pattern into segments and match path separators. If not
+    given, os.path.sep and os.path.altsep (where available) are used.
+    """
+    if not seps:  # None or empty: fall back to the OS separators
+        if os.path.altsep:
+            seps = (os.path.sep, os.path.altsep)
+        else:
+            seps = os.path.sep
+    escaped_seps = ''.join(map(re.escape, seps))  # each separator, regex-escaped
+    any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps  # one separator
+    not_sep = f'[^{escaped_seps}]'  # one non-separator character
+    if include_hidden:  # regex building blocks for lone '*' and '**' segments
+        one_last_segment = f'{not_sep}+'  # final '*': a non-empty segment
+        one_segment = f'{one_last_segment}{any_sep}'  # inner '*': segment + separator
+        any_segments = f'(?:.+{any_sep})?'  # inner '**': optional text ending in a separator
+        any_last_segments = '.*'  # final '**': anything, including nothing
+    else:
+        one_last_segment = f'[^{escaped_seps}.]{not_sep}*'  # first char is not a dot
+        one_segment = f'{one_last_segment}{any_sep}'  # inner '*': segment + separator
+        any_segments = f'(?:{one_segment})*'  # inner '**': zero or more non-hidden segments
+        any_last_segments = f'{any_segments}(?:{one_last_segment})?'  # final '**'
+
+    results = []  # regex fragments, joined at the end
+    parts = re.split(any_sep, pat)  # pattern segments; separators are dropped here
+    last_part_idx = len(parts) - 1
+    for idx, part in enumerate(parts):
+        if part == '*':  # lone '*': exactly one path segment
+            results.append(one_segment if idx < last_part_idx else one_last_segment)
+            continue  # one_segment already ends with the separator
+        if recursive:
+            if part == '**':
+                if idx < last_part_idx:
+                    if parts[idx + 1] != '**':  # emit once for a run of '**' segments
+                        results.append(any_segments)
+                else:
+                    results.append(any_last_segments)
+                continue  # any_segments already ends with the separator
+            elif '**' in part:  # '**' mixed with other characters in one segment
+                raise ValueError("Invalid pattern: '**' can only be an entire path component")
+        if part:  # empty segments contribute only the separator appended below
+            if not include_hidden and part[0] in '*?':  # leading wildcard must not match '.'
+                results.append(r'(?!\.)')
+            results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep))  # NOTE(review): private helper, not visible here; presumably maps '*' and '?' to these regexes - confirm
+        if idx < last_part_idx:  # put back the separator that re.split removed
+            results.append(any_sep)
+    res = ''.join(results)
+    return fr'(?s:{res})\Z'  # (?s): '.' also matches newlines; \Z: anchor at end of string