mirror of
https://github.com/astral-sh/ruff.git
synced 2025-10-03 15:15:33 +00:00
406 lines
14 KiB
Rust
406 lines
14 KiB
Rust
use globset::{Glob, GlobBuilder, GlobSet, GlobSetBuilder};
|
|
use regex_automata::dfa;
|
|
use regex_automata::dfa::Automaton;
|
|
use ruff_db::system::SystemPath;
|
|
use std::fmt::Formatter;
|
|
use std::path::{MAIN_SEPARATOR, MAIN_SEPARATOR_STR};
|
|
use tracing::warn;
|
|
|
|
use crate::glob::portable::AbsolutePortableGlobPattern;
|
|
|
|
/// Chosen at a whim -Konsti
|
|
const DFA_SIZE_LIMIT: usize = 1_000_000;
|
|
|
|
/// Path filter based on a set of include globs.
|
|
///
|
|
/// The patterns are similar to gitignore, but reversed:
|
|
///
|
|
/// * `/src`: matches a file or directory with its content named `src`
|
|
/// * `/src/`: matches a directory with its content named `src`
|
|
/// * `/src/**` or `/src/*`: matches the content of `src`, but not a file named `src`
|
|
///
|
|
/// Negated patterns are not supported.
|
|
///
|
|
/// Internally, the globs are converted to a regex and then to a DFA, which unlike the globs and the
|
|
/// regex allows to check for prefix matches.
|
|
///
|
|
/// ## Equality
|
|
/// Equality is based on the patterns from which a filter was constructed.
|
|
///
|
|
/// Because of that, two filters that include the exact same files but were
|
|
/// constructed from different patterns (or even just order) compare unequal.
|
|
#[derive(Clone)]
|
|
pub(crate) struct IncludeFilter {
|
|
glob_set: GlobSet,
|
|
original_patterns: Box<[String]>,
|
|
dfa: Option<dfa::dense::DFA<Vec<u32>>>,
|
|
}
|
|
|
|
impl IncludeFilter {
|
|
/// Whether the file matches any of the globs.
|
|
pub(crate) fn match_file(&self, path: impl AsRef<SystemPath>) -> bool {
|
|
let path = path.as_ref();
|
|
|
|
self.glob_set.is_match(path)
|
|
}
|
|
|
|
/// Check whether a directory or any of its children can be matched by any of the globs.
|
|
///
|
|
/// This never returns `false` if any child matches, but it may return `true` even if we
|
|
/// don't end up including any child.
|
|
pub(crate) fn match_directory(&self, path: impl AsRef<SystemPath>) -> bool {
|
|
self.match_directory_impl(path.as_ref())
|
|
}
|
|
|
|
fn match_directory_impl(&self, path: &SystemPath) -> bool {
|
|
let Some(dfa) = &self.dfa else {
|
|
return true;
|
|
};
|
|
|
|
// Allow the root path
|
|
if path == SystemPath::new("") {
|
|
return true;
|
|
}
|
|
|
|
let config_anchored =
|
|
regex_automata::util::start::Config::new().anchored(regex_automata::Anchored::Yes);
|
|
let mut state = dfa.start_state(&config_anchored).unwrap();
|
|
|
|
let byte_path = path
|
|
.as_str()
|
|
.strip_suffix('/')
|
|
.unwrap_or(path.as_str())
|
|
.as_bytes();
|
|
for b in byte_path {
|
|
state = dfa.next_state(state, *b);
|
|
}
|
|
// Say we're looking at a directory `foo/bar`. We want to continue if either `foo/bar` is
|
|
// a match, e.g., from `foo/*`, or a path below it can match, e.g., from `foo/bar/*`.
|
|
let eoi_state = dfa.next_eoi_state(state);
|
|
// We must not call `next_eoi_state` on the slash state, we want to only check if more
|
|
// characters (path components) are allowed, not if we're matching the `$` anchor at the
|
|
// end.
|
|
let slash_state = dfa.next_state(state, u8::try_from(MAIN_SEPARATOR).unwrap());
|
|
|
|
debug_assert!(
|
|
!dfa.is_quit_state(eoi_state) && !dfa.is_quit_state(slash_state),
|
|
"matcher is in quit state"
|
|
);
|
|
|
|
dfa.is_match_state(eoi_state) || !dfa.is_dead_state(slash_state)
|
|
}
|
|
}
|
|
|
|
impl std::fmt::Debug for IncludeFilter {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
f.debug_tuple("IncludeFilter")
|
|
.field(&self.original_patterns)
|
|
.finish_non_exhaustive()
|
|
}
|
|
}
|
|
|
|
impl std::fmt::Display for IncludeFilter {
|
|
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
|
f.debug_list().entries(&self.original_patterns).finish()
|
|
}
|
|
}
|
|
|
|
impl PartialEq for IncludeFilter {
|
|
fn eq(&self, other: &Self) -> bool {
|
|
self.original_patterns == other.original_patterns
|
|
}
|
|
}
|
|
|
|
impl Eq for IncludeFilter {}
|
|
|
|
#[derive(Debug)]
|
|
pub(crate) struct IncludeFilterBuilder {
|
|
set: GlobSetBuilder,
|
|
original_pattern: Vec<String>,
|
|
regexes: Vec<String>,
|
|
}
|
|
|
|
impl IncludeFilterBuilder {
|
|
pub(crate) fn new() -> Self {
|
|
Self {
|
|
set: GlobSetBuilder::new(),
|
|
original_pattern: Vec::new(),
|
|
regexes: Vec::new(),
|
|
}
|
|
}
|
|
|
|
/// Adds an include pattern to the filter.
|
|
pub(crate) fn add(
|
|
&mut self,
|
|
input: &AbsolutePortableGlobPattern,
|
|
) -> Result<&mut Self, globset::Error> {
|
|
let mut glob_pattern = input.absolute();
|
|
|
|
let mut only_directory = false;
|
|
|
|
// A pattern ending with a `/` should only match directories. E.g. `src/` only matches directories
|
|
// whereas `src` matches both files and directories.
|
|
// We need to remove the `/` to ensure that a path missing the trailing `/` matches.
|
|
if let Some(after) = glob_pattern.strip_suffix('/') {
|
|
// Escaped `/` or `\` aren't allowed. `portable_glob::parse` will error
|
|
only_directory = true;
|
|
glob_pattern = after;
|
|
}
|
|
|
|
// If regex ends with `/**`, only push that one glob and regex
|
|
// Otherwise, push two regex, one for `/**` and one for without
|
|
let glob = GlobBuilder::new(glob_pattern)
|
|
.literal_separator(true)
|
|
// No need to support Windows-style paths, so the backslash can be used a escape.
|
|
.backslash_escape(true)
|
|
.build()?;
|
|
self.original_pattern.push(input.relative().to_string());
|
|
|
|
// `lib` is the same as `lib/**`
|
|
// Add a glob that matches `lib` exactly, change the glob to `lib/**`.
|
|
if glob_pattern.ends_with("**") {
|
|
self.push_prefix_regex(&glob);
|
|
self.set.add(glob);
|
|
} else {
|
|
let prefix_glob = GlobBuilder::new(&format!("{glob_pattern}/**"))
|
|
.literal_separator(true)
|
|
// No need to support Windows-style paths, so the backslash can be used a escape.
|
|
.backslash_escape(true)
|
|
.build()?;
|
|
|
|
self.push_prefix_regex(&prefix_glob);
|
|
self.set.add(prefix_glob);
|
|
|
|
// The reason we add the exact glob, e.g. `src` when the original pattern was `src/` is
|
|
// so that `match_file` returns true when matching against a file. However, we don't
|
|
// need to do this if this is a pattern that should only match a directory (specifically, its contents).
|
|
if !only_directory {
|
|
self.set.add(glob);
|
|
}
|
|
}
|
|
|
|
Ok(self)
|
|
}
|
|
|
|
fn push_prefix_regex(&mut self, glob: &Glob) {
|
|
let main_separator = regex::escape(MAIN_SEPARATOR_STR);
|
|
|
|
let regex = glob
|
|
.regex()
|
|
// We are using a custom DFA builder
|
|
.strip_prefix("(?-u)")
|
|
.expect("a glob is a non-unicode byte regex")
|
|
// Match windows paths if applicable
|
|
.replace('/', &main_separator);
|
|
|
|
self.regexes.push(regex);
|
|
}
|
|
|
|
/// The filter matches if any of the globs matches.
|
|
///
|
|
/// See <https://github.com/BurntSushi/ripgrep/discussions/2927> for the error returned.
|
|
pub(crate) fn build(self) -> Result<IncludeFilter, globset::Error> {
|
|
let glob_set = self.set.build()?;
|
|
|
|
let dfa_builder = dfa::dense::Builder::new()
|
|
.syntax(
|
|
// The glob regex is a byte matcher
|
|
regex_automata::util::syntax::Config::new()
|
|
.unicode(false)
|
|
.utf8(false),
|
|
)
|
|
.configure(
|
|
dfa::dense::Config::new()
|
|
.start_kind(dfa::StartKind::Anchored)
|
|
// DFA can grow exponentially, in which case we bail out
|
|
.dfa_size_limit(Some(DFA_SIZE_LIMIT))
|
|
.determinize_size_limit(Some(DFA_SIZE_LIMIT)),
|
|
)
|
|
.build_many(&self.regexes);
|
|
let dfa = if let Ok(dfa) = dfa_builder {
|
|
Some(dfa)
|
|
} else {
|
|
// TODO(konsti): `regex_automata::dfa::dense::BuildError` should allow asking whether
|
|
// is a size error
|
|
warn!(
|
|
"Glob expressions regex is larger than {DFA_SIZE_LIMIT} bytes, \
|
|
falling back to full directory traversal!"
|
|
);
|
|
None
|
|
};
|
|
|
|
Ok(IncludeFilter {
|
|
glob_set,
|
|
dfa,
|
|
original_patterns: self.original_pattern.into(),
|
|
})
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use std::path::{MAIN_SEPARATOR, MAIN_SEPARATOR_STR};
|
|
|
|
use crate::glob::include::{IncludeFilter, IncludeFilterBuilder};
|
|
use crate::glob::{PortableGlobKind, PortableGlobPattern};
|
|
use ruff_db::system::{MemoryFileSystem, walk_directory::WalkState};
|
|
|
|
fn create_filter(patterns: impl IntoIterator<Item = &'static str>) -> IncludeFilter {
|
|
let mut builder = IncludeFilterBuilder::new();
|
|
for pattern in patterns {
|
|
builder
|
|
.add(
|
|
&PortableGlobPattern::parse(pattern, PortableGlobKind::Include)
|
|
.unwrap()
|
|
.into_absolute(""),
|
|
)
|
|
.unwrap();
|
|
}
|
|
|
|
builder.build().unwrap()
|
|
}
|
|
|
|
fn setup_files(files: impl IntoIterator<Item = &'static str>) -> MemoryFileSystem {
|
|
let fs = MemoryFileSystem::new();
|
|
|
|
fs.write_files_all(files.into_iter().map(|name| (name, "")))
|
|
.unwrap();
|
|
fs
|
|
}
|
|
|
|
#[track_caller]
|
|
fn assert_match_directory(filter: &IncludeFilter, path: &str) {
|
|
assert!(filter.match_directory(path.replace('/', MAIN_SEPARATOR_STR)));
|
|
}
|
|
|
|
#[track_caller]
|
|
fn assert_not_match_directory(filter: &IncludeFilter, path: &str) {
|
|
assert!(!filter.match_directory(path.replace('/', MAIN_SEPARATOR_STR)));
|
|
}
|
|
|
|
#[test]
|
|
fn match_directory() {
|
|
// `lib` is the same as `src/**`. It includes a file or directory (including its contents)
|
|
// `src/*`: The same as `src/**`
|
|
let filter = create_filter(["lib", "src/*", "tests/**", "a/test-*/b", "files/*.py"]);
|
|
|
|
assert_match_directory(&filter, "lib");
|
|
assert_match_directory(&filter, "lib/more/test");
|
|
|
|
assert_match_directory(&filter, "src");
|
|
assert_match_directory(&filter, "src/more/test");
|
|
|
|
assert_match_directory(&filter, "tests");
|
|
assert_match_directory(&filter, "tests/more/test");
|
|
|
|
assert_match_directory(&filter, "a");
|
|
assert_match_directory(&filter, "a/test-b");
|
|
|
|
assert_not_match_directory(&filter, "a/test-b/x");
|
|
assert_not_match_directory(&filter, "a/test");
|
|
|
|
assert_match_directory(&filter, "files/a.py");
|
|
assert_match_directory(&filter, "files/a.py/bcd");
|
|
|
|
assert_not_match_directory(&filter, "not_included");
|
|
assert_not_match_directory(&filter, "files/a.pi");
|
|
}
|
|
|
|
#[test]
|
|
fn match_file() {
|
|
// `lib` is the same as `src/**`. It includes a file or directory (including its contents)
|
|
// `src/*`: The same as `src/**`
|
|
let filter = create_filter([
|
|
"lib",
|
|
"src/*",
|
|
"directory/",
|
|
"tests/**",
|
|
"a/test-*/b",
|
|
"files/*.py",
|
|
]);
|
|
|
|
assert!(filter.match_file("lib"));
|
|
assert!(filter.match_file("lib/more/test"));
|
|
|
|
// Unlike `directory`, `directory/` only includes a directory with the given name and its contents
|
|
assert!(!filter.match_file("directory"));
|
|
assert!(filter.match_file("directory/more/test"));
|
|
|
|
// Unlike `src`, `src/*` only includes a directory with the given name.
|
|
assert!(!filter.match_file("src"));
|
|
assert!(filter.match_file("src/more/test"));
|
|
|
|
// Unlike `tests`, `tests/**` only includes files under `tests`, but not a file named tests
|
|
assert!(!filter.match_file("tests"));
|
|
assert!(filter.match_file("tests/more/test"));
|
|
|
|
// Unlike `match_directory`, prefixes should not be included.
|
|
assert!(!filter.match_file("a"));
|
|
assert!(!filter.match_file("a/test-b"));
|
|
|
|
assert!(!filter.match_file("a/test-b/x"));
|
|
assert!(!filter.match_file("a/test"));
|
|
|
|
assert!(filter.match_file("files/a.py"));
|
|
assert!(filter.match_file("files/a.py/bcd"));
|
|
|
|
assert!(!filter.match_file("not_included"));
|
|
assert!(!filter.match_file("files/a.pi"));
|
|
}
|
|
|
|
/// Check that we skip directories that can never match.
|
|
#[test]
|
|
fn prefilter() {
|
|
let filter = create_filter(["/a/b/test-*/d", "/a/b/c/e", "/b/c"]);
|
|
let fs = setup_files([
|
|
// Should visit
|
|
"/a/b/test-a/d",
|
|
"/a/b/c/e",
|
|
"/b/c",
|
|
// Can skip
|
|
"/d/e",
|
|
"/a/b/x/f",
|
|
]);
|
|
|
|
let visited = std::sync::Mutex::new(Vec::new());
|
|
|
|
// Test the prefix filtering
|
|
fs.walk_directory("/").run(|| {
|
|
Box::new(|entry| {
|
|
let entry = entry.unwrap();
|
|
|
|
if entry.file_type().is_directory() {
|
|
if !filter.match_directory(entry.path()) {
|
|
return WalkState::Skip;
|
|
}
|
|
}
|
|
|
|
visited
|
|
.lock()
|
|
.unwrap()
|
|
.push(entry.path().as_str().replace(MAIN_SEPARATOR, "/"));
|
|
|
|
WalkState::Continue
|
|
})
|
|
});
|
|
|
|
let mut visited = visited.into_inner().unwrap();
|
|
visited.sort();
|
|
|
|
// Assert that it didn't traverse into `/d` or `/a/b/x`
|
|
assert_eq!(
|
|
visited,
|
|
[
|
|
"/",
|
|
"/a",
|
|
"/a/b",
|
|
"/a/b/c",
|
|
"/a/b/c/e",
|
|
"/a/b/test-a",
|
|
"/a/b/test-a/d",
|
|
"/b",
|
|
"/b/c"
|
|
]
|
|
);
|
|
}
|
|
}
|