Merge 3c7c52bcaf into f609e1ddaf

2025-07-07 13:25:00 +00:00 · 2025-07-06 21:13:53 +02:00 · 2025-07-06 21:13:53 +02:00 · 05b1f1a469
commit 05b1f1a469
parent f609e1ddaf 3c7c52bcaf
3 changed files with 330 additions and 22 deletions
--- a/crates/uv-cache-info/src/cache_info.rs
+++ b/crates/uv-cache-info/src/cache_info.rs
@ -7,6 +7,7 @@ use serde::Deserialize;
 use tracing::{debug, warn};

 use crate::git_info::{Commit, Tags};
+use crate::glob::cluster_globs;
 use crate::timestamp::Timestamp;

 #[derive(Debug, thiserror::Error)]
@ -212,34 +213,39 @@ impl CacheInfo {
            }
        }

-        // If we have any globs, process them in a single pass.
+        // If we have any globs, first cluster them using LCP and then do a single pass on each group.
        if !globs.is_empty() {
-            let walker = globwalk::GlobWalkerBuilder::from_patterns(directory, &globs)
+            for (glob_base, glob_patterns) in cluster_globs(&globs) {
+                let walker = globwalk::GlobWalkerBuilder::from_patterns(
+                    directory.join(glob_base),
+                    &glob_patterns,
+                )
                .file_type(globwalk::FileType::FILE | globwalk::FileType::SYMLINK)
                .build()?;
-            for entry in walker {
-                let entry = match entry {
-                    Ok(entry) => entry,
-                    Err(err) => {
-                        warn!("Failed to read glob entry: {err}");
+                for entry in walker {
+                    let entry = match entry {
+                        Ok(entry) => entry,
+                        Err(err) => {
+                            warn!("Failed to read glob entry: {err}");
+                            continue;
+                        }
+                    };
+                    let metadata = match entry.metadata() {
+                        Ok(metadata) => metadata,
+                        Err(err) => {
+                            warn!("Failed to read metadata for glob entry: {err}");
+                            continue;
+                        }
+                    };
+                    if !metadata.is_file() {
+                        warn!(
+                            "Expected file for cache key, but found directory: `{}`",
+                            entry.path().display()
+                        );
                        continue;
                    }
-                };
-                let metadata = match entry.metadata() {
-                    Ok(metadata) => metadata,
-                    Err(err) => {
-                        warn!("Failed to read metadata for glob entry: {err}");
-                        continue;
-                    }
-                };
-                if !metadata.is_file() {
-                    warn!(
-                        "Expected file for cache key, but found directory: `{}`",
-                        entry.path().display()
-                    );
-                    continue;
+                    timestamp = max(timestamp, Some(Timestamp::from_metadata(&metadata)));
                }
-                timestamp = max(timestamp, Some(Timestamp::from_metadata(&metadata)));
            }
        }

--- a/crates/uv-cache-info/src/glob.rs
+++ b/crates/uv-cache-info/src/glob.rs
@ -0,0 +1,301 @@
+use std::{
+    collections::BTreeMap,
+    path::{Component, Components, Path, PathBuf},
+};
+
+/// Check if a component of the path looks like it may be a glob pattern.
+///
+/// Note: this function is being used when splitting a glob pattern into a long possible
+/// base and the glob remainder (scanning through components until we hit the first component
+/// for which this function returns true). It is acceptable for this function to return
+/// false positives (e.g. patterns like 'foo[bar' or 'foo{bar') in which case correctness
+/// will not be affected but efficiency might be (because we'll traverse more than we should),
+/// however it should not return false negatives.
+fn is_glob_like(part: Component) -> bool {
+    matches!(part, Component::Normal(_))
+        && part.as_os_str().to_str().is_some_and(|part| {
+            ["*", "{", "}", "?", "[", "]"]
+                .into_iter()
+                .any(|c| part.contains(c))
+        })
+}
+
+#[derive(Debug, Default, Clone, PartialEq, Eq)]
+struct GlobParts {
+    base: PathBuf,
+    pattern: PathBuf,
+}
+
+/// Split a glob into longest possible base + shortest possible glob pattern.
+fn split_glob(pattern: impl AsRef<str>) -> GlobParts {
+    let pattern: &Path = pattern.as_ref().as_ref();
+
+    let mut glob = GlobParts::default();
+    let mut globbing = false;
+    let mut last = None;
+
+    for part in pattern.components() {
+        if let Some(last) = last {
+            if last != Component::CurDir {
+                if globbing {
+                    glob.pattern.push(last);
+                } else {
+                    glob.base.push(last);
+                }
+            }
+        }
+        if !globbing {
+            globbing = is_glob_like(part);
+        }
+        // we don't know if this part is the last one, defer handling it by one iteration
+        last = Some(part);
+    }
+
+    if let Some(last) = last {
+        // defer handling the last component to prevent draining entire pattern into base
+        if globbing || matches!(last, Component::Normal(_)) {
+            glob.pattern.push(last);
+        } else {
+            glob.base.push(last);
+        }
+    }
+    glob
+}
+
+/// Classic trie with edges being path components and values being glob patterns.
+#[derive(Default)]
+struct Trie<'a> {
+    children: BTreeMap<Component<'a>, Trie<'a>>,
+    patterns: Vec<&'a Path>,
+}
+
+impl<'a> Trie<'a> {
+    fn insert(&mut self, mut components: Components<'a>, pattern: &'a Path) {
+        if let Some(part) = components.next() {
+            self.children
+                .entry(part)
+                .or_default()
+                .insert(components, pattern);
+        } else {
+            self.patterns.push(pattern);
+        }
+    }
+
+    #[allow(clippy::needless_pass_by_value)]
+    fn collect_patterns(
+        &self,
+        pattern_prefix: PathBuf,
+        group_prefix: PathBuf,
+        patterns: &mut Vec<PathBuf>,
+        groups: &mut Vec<(PathBuf, Vec<PathBuf>)>,
+    ) {
+        // collect all patterns beneath and including this node
+        for pattern in &self.patterns {
+            patterns.push(pattern_prefix.join(pattern));
+        }
+        for (part, child) in &self.children {
+            if let Component::Normal(_) = part {
+                // for normal components, collect all descendant patterns ('normal' edges only)
+                child.collect_patterns(
+                    pattern_prefix.join(part),
+                    group_prefix.join(part),
+                    patterns,
+                    groups,
+                );
+            } else {
+                // for non-normal component edges, kick off separate group collection at this node
+                child.collect_groups(group_prefix.join(part), groups);
+            }
+        }
+    }
+
+    #[allow(clippy::needless_pass_by_value)]
+    fn collect_groups(&self, prefix: PathBuf, groups: &mut Vec<(PathBuf, Vec<PathBuf>)>) {
+        // LCP-style grouping of patterns
+        if self.patterns.is_empty() {
+            // no patterns in this node; child nodes can form independent groups
+            for (part, child) in &self.children {
+                child.collect_groups(prefix.join(part), groups);
+            }
+        } else {
+            // pivot point, we've hit a pattern node; we have to stop here and form a group
+            let mut group = Vec::new();
+            self.collect_patterns(PathBuf::new(), prefix.clone(), &mut group, groups);
+            groups.push((prefix, group));
+        }
+    }
+}
+
+/// Given a collection of globs, cluster them into (base, globs) groups so that:
+/// - base doesn't contain any glob symbols
+/// - each directory would only be walked at most once
+/// - base of each group is the longest common prefix of globs in the group
+pub(crate) fn cluster_globs(patterns: &[impl AsRef<str>]) -> Vec<(PathBuf, Vec<String>)> {
+    // split all globs into base/pattern
+    let globs: Vec<_> = patterns.iter().map(split_glob).collect();
+
+    // construct a path trie out of all split globs
+    let mut trie = Trie::default();
+    for glob in &globs {
+        trie.insert(glob.base.components(), &glob.pattern);
+    }
+
+    // run LCP-style aggregation of patterns in the trie into groups
+    let mut groups = Vec::new();
+    trie.collect_groups(PathBuf::new(), &mut groups);
+
+    // finally, convert resulting patterns to strings
+    groups
+        .into_iter()
+        .map(|(base, patterns)| {
+            (
+                base,
+                patterns
+                    .iter()
+                    // NOTE: this unwrap is ok because input patterns are valid utf-8
+                    .map(|p| p.to_str().unwrap().to_owned())
+                    .collect(),
+            )
+        })
+        .collect()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{cluster_globs, split_glob, GlobParts};
+
+    fn windowsify(path: &str) -> String {
+        if cfg!(windows) {
+            path.replace('/', "\\")
+        } else {
+            path.to_owned()
+        }
+    }
+
+    #[test]
+    fn test_split_glob() {
+        #[track_caller]
+        fn check(input: &str, base: &str, pattern: &str) {
+            let result = split_glob(input);
+            let expected = GlobParts {
+                base: base.into(),
+                pattern: pattern.into(),
+            };
+            assert_eq!(result, expected, "{input:?} != {base:?} + {pattern:?}");
+        }
+
+        check("", "", "");
+        check("a", "", "a");
+        check("a/b", "a", "b");
+        check("a/b/", "a", "b");
+        check("a/.//b/", "a", "b");
+        check("./a/b/c", "a/b", "c");
+        check("c/d/*", "c/d", "*");
+        check("c/d/*/../*", "c/d", "*/../*");
+        check("a/?b/c", "a", "?b/c");
+        check("/a/b/*", "/a/b", "*");
+        check("../x/*", "../x", "*");
+        check("a/{b,c}/d", "a", "{b,c}/d");
+        check("a/[bc]/d", "a", "[bc]/d");
+        check("*", "", "*");
+        check("*/*", "", "*/*");
+        check("..", "..", "");
+        check("/", "/", "");
+    }
+
+    #[test]
+    fn test_cluster_globs() {
+        #[track_caller]
+        fn check(input: &[&str], expected: &[(&str, &[&str])]) {
+            let input = input.iter().map(|s| windowsify(s)).collect::<Vec<_>>();
+
+            let mut result_sorted = cluster_globs(&input);
+            for (_, patterns) in &mut result_sorted {
+                patterns.sort_unstable();
+            }
+            result_sorted.sort_unstable();
+
+            let mut expected_sorted = Vec::new();
+            for (base, patterns) in expected {
+                let mut patterns_sorted = Vec::new();
+                for pattern in *patterns {
+                    patterns_sorted.push(windowsify(pattern));
+                }
+                patterns_sorted.sort_unstable();
+                expected_sorted.push((windowsify(base).into(), patterns_sorted));
+            }
+            expected_sorted.sort_unstable();
+
+            assert_eq!(
+                result_sorted, expected_sorted,
+                "{input:?} != {expected_sorted:?} (got: {result_sorted:?})"
+            );
+        }
+
+        check(&["a/b/*", "a/c/*"], &[("a/b", &["*"]), ("a/c", &["*"])]);
+        check(&["./a/b/*", "a/c/*"], &[("a/b", &["*"]), ("a/c", &["*"])]);
+        check(&["/a/b/*", "/a/c/*"], &[("/a/b", &["*"]), ("/a/c", &["*"])]);
+        check(
+            &["../a/b/*", "../a/c/*"],
+            &[("../a/b", &["*"]), ("../a/c", &["*"])],
+        );
+        check(&["x/*", "y/*"], &[("x", &["*"]), ("y", &["*"])]);
+        check(&[], &[]);
+        check(
+            &["./*", "a/*", "../foo/*.png"],
+            &[("", &["*", "a/*"]), ("../foo", &["*.png"])],
+        );
+        check(
+            &[
+                "?",
+                "/foo/?",
+                "/foo/bar/*",
+                "../bar/*.png",
+                "../bar/../baz/*.jpg",
+            ],
+            &[
+                ("", &["?"]),
+                ("/foo", &["?", "bar/*"]),
+                ("../bar", &["*.png"]),
+                ("../bar/../baz", &["*.jpg"]),
+            ],
+        );
+        check(&["/abs/path/*"], &[("/abs/path", &["*"])]);
+        check(&["/abs/*", "rel/*"], &[("/abs", &["*"]), ("rel", &["*"])]);
+        check(&["a/{b,c}/*", "a/d?/*"], &[("a", &["{b,c}/*", "d?/*"])]);
+        check(
+            &[
+                "../shared/a/[abc].png",
+                "../shared/a/b/*",
+                "../shared/b/c/?x/d",
+                "docs/important/*.{doc,xls}",
+                "docs/important/very/*",
+            ],
+            &[
+                ("../shared/a", &["[abc].png", "b/*"]),
+                ("../shared/b/c", &["?x/d"]),
+                ("docs/important", &["*.{doc,xls}", "very/*"]),
+            ],
+        );
+        check(&["file.txt"], &[("", &["file.txt"])]);
+        check(&["/"], &[("/", &[""])]);
+        check(&[".."], &[("..", &[""])]);
+        check(
+            &["file1.txt", "file2.txt"],
+            &[("", &["file1.txt", "file2.txt"])],
+        );
+        check(
+            &["a/file1.txt", "a/file2.txt"],
+            &[("a", &["file1.txt", "file2.txt"])],
+        );
+        check(
+            &["*", "a/b/*", "a/../c/*.jpg", "a/../c/*.png", "/a/*", "/b/*"],
+            &[
+                ("", &["*", "a/b/*"]),
+                ("a/../c", &["*.jpg", "*.png"]),
+                ("/a", &["*"]),
+                ("/b", &["*"]),
+            ],
+        );
+    }
+}
--- a/crates/uv-cache-info/src/lib.rs
+++ b/crates/uv-cache-info/src/lib.rs
@ -3,4 +3,5 @@ pub use crate::timestamp::*;

 mod cache_info;
 mod git_info;
+mod glob;
 mod timestamp;