Build backend: Switch to custom glob-walkdir implementation (#9013)

When doing a directory traversal for source dist inclusion, we want to
offer the user include and exclude options, and we want to avoid
traversing irrelevant directories. The latter is important for
performance, especially on network file systems, but also with large
data directories, or (not-included) directories with other permissions.
To support this, we introduce `GlobDirFilter`, which uses a DFA from
regex_automata to determine whether any children of a directory can be
included and skips the directory if not.

The globs are based on PEP 639. The syntax is more restricted than glob
or globset, but it's standardized. I chose it over glob or globset
because we're already using this syntax for `project.license-files`, as
required by PEP 639, so it makes sense to use the same globs for all
includes (see e.g.
4f52a3bb62/pyproject.toml (L36-L48)
for an example with the same semantics for include and exclude).

### Semantics

Glob semantics are complex due to mixing directories and files,
expectations around simplicity and our need to exclude most of the tree
in the project from traversal. The current draft uses a syntax that
optimizes for simple default use cases for the start.

#### includes

Glob expressions specifying which files and directories to include in
the source distribution.

Includes are anchored, which means that `pyproject.toml` includes only
`<project root>/pyproject.toml`. Use for example `assets/**/sample.csv`
to include all
`sample.csv` files in `<project root>/assets` or any child directory. To
recursively include
all files under a directory, use a `/**` suffix, e.g. `src/**`. For
performance and
reproducibility, avoid unanchored matches such as `**/sample.csv`.

The glob syntax is the reduced portable glob from
[PEP 639](https://peps.python.org/pep-0639/#add-license-FILES-key).

#### excludes

Glob expressions specifying which files and directories to exclude from
the previous source
distribution includes.

Excludes are not anchored, which means that `__pycache__` excludes all
directories named
`__pycache__` and their children anywhere. To anchor a directory, use a
`/` prefix, e.g.,
`/dist` will exclude only `<project root>/dist`.

The glob syntax is the reduced portable glob from
[PEP 639](https://peps.python.org/pep-0639/#add-license-FILES-key).
This commit is contained in:
konsti 2024-11-14 14:14:58 +01:00 committed by GitHub
parent e310dcc7c1
commit 4ac78f673b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
17 changed files with 798 additions and 195 deletions

17
Cargo.lock generated
View file

@ -4522,7 +4522,6 @@ dependencies = [
"csv",
"flate2",
"fs-err",
"glob",
"globset",
"indoc",
"insta",
@ -4537,6 +4536,7 @@ dependencies = [
"tracing",
"uv-distribution-filename",
"uv-fs",
"uv-globfilter",
"uv-normalize",
"uv-pep440",
"uv-pep508",
@ -4974,6 +4974,21 @@ dependencies = [
"which",
]
[[package]]
name = "uv-globfilter"
version = "0.1.0"
dependencies = [
"fs-err",
"globset",
"insta",
"regex",
"regex-automata 0.4.8",
"tempfile",
"thiserror",
"tracing",
"walkdir",
]
[[package]]
name = "uv-install-wheel"
version = "0.0.1"

View file

@ -39,6 +39,7 @@ uv-distribution-types = { path = "crates/uv-distribution-types" }
uv-extract = { path = "crates/uv-extract" }
uv-fs = { path = "crates/uv-fs" }
uv-git = { path = "crates/uv-git" }
uv-globfilter = { path = "crates/uv-globfilter" }
uv-install-wheel = { path = "crates/uv-install-wheel", default-features = false }
uv-installer = { path = "crates/uv-installer" }
uv-macros = { path = "crates/uv-macros" }
@ -134,6 +135,7 @@ quote = { version = "1.0.37" }
rayon = { version = "1.10.0" }
reflink-copy = { version = "0.1.19" }
regex = { version = "1.10.6" }
regex-automata = { version = "0.4.8", default-features = false, features = ["dfa-build", "dfa-search", "perf", "std", "syntax"] }
reqwest = { version = "0.12.7", default-features = false, features = ["json", "gzip", "stream", "rustls-tls", "rustls-tls-native-roots", "socks", "multipart", "http2"] }
reqwest-middleware = { version = "0.4.0", features = ["multipart"] }
reqwest-retry = { version = "0.7.0" }

View file

@ -15,6 +15,7 @@ doctest = false
[dependencies]
uv-distribution-filename = { workspace = true }
uv-fs = { workspace = true }
uv-globfilter = { workspace = true }
uv-normalize = { workspace = true }
uv-pep440 = { workspace = true }
uv-pep508 = { workspace = true }
@ -24,7 +25,6 @@ uv-warnings = { workspace = true }
csv = { workspace = true }
flate2 = { workspace = true }
fs-err = { workspace = true }
glob = { workspace = true }
globset = { workspace = true }
itertools = { workspace = true }
serde = { workspace = true }

View file

@ -1,13 +1,10 @@
mod metadata;
mod pep639_glob;
use crate::metadata::{PyProjectToml, ValidationError};
use crate::pep639_glob::Pep639GlobError;
use flate2::write::GzEncoder;
use flate2::Compression;
use fs_err::File;
use glob::{GlobError, PatternError};
use globset::{Glob, GlobSetBuilder};
use globset::GlobSetBuilder;
use itertools::Itertools;
use sha2::{Digest, Sha256};
use std::fs::FileType;
@ -19,6 +16,7 @@ use thiserror::Error;
use tracing::{debug, trace};
use uv_distribution_filename::{SourceDistExtension, SourceDistFilename, WheelFilename};
use uv_fs::Simplified;
use uv_globfilter::{parse_portable_glob, GlobDirFilter, PortableGlobError};
use walkdir::WalkDir;
use zip::{CompressionMethod, ZipWriter};
@ -30,16 +28,26 @@ pub enum Error {
Toml(#[from] toml::de::Error),
#[error("Invalid pyproject.toml")]
Validation(#[from] ValidationError),
#[error("Invalid `project.license-files` glob expression: `{0}`")]
Pep639Glob(String, #[source] Pep639GlobError),
#[error("The `project.license-files` entry is not a valid glob pattern: `{0}`")]
Pattern(String, #[source] PatternError),
/// [`GlobError`] is a wrapped io error.
#[error(transparent)]
Glob(#[from] GlobError),
#[error("Unsupported glob expression in: `{field}`")]
PortableGlob {
field: String,
#[source]
source: PortableGlobError,
},
/// <https://github.com/BurntSushi/ripgrep/discussions/2927>
#[error("Glob expressions caused to large regex in: `{field}`")]
GlobSetTooLarge {
field: String,
#[source]
source: globset::Error,
},
/// [`globset::Error`] shows the glob that failed to parse.
#[error(transparent)]
GlobSet(#[from] globset::Error),
#[error("Unsupported glob expression in: `{field}`")]
GlobSet {
field: String,
#[source]
err: globset::Error,
},
#[error("Failed to walk source tree: `{}`", root.user_display())]
WalkDir {
root: PathBuf,
@ -322,7 +330,10 @@ pub fn build_wheel(
err,
})?;
let relative_path = entry.path().strip_prefix(&strip_root)?;
let relative_path = entry
.path()
.strip_prefix(&strip_root)
.expect("walkdir starts with root");
let relative_path_str = relative_path
.to_str()
.ok_or_else(|| Error::NotUtf8Path(relative_path.to_path_buf()))?;
@ -354,10 +365,52 @@ pub fn build_wheel(
Ok(filename)
}
/// TODO(konsti): Wire this up with actual settings and remove this struct.
///
/// To select which files to include in the source distribution, we first add the includes, then
/// remove the excludes from that.
pub struct SourceDistSettings {
/// Glob expressions specifying which files and directories to include in the source
/// distribution.
///
/// Includes are anchored, which means that `pyproject.toml` includes only
/// `<project root>/pyproject.toml`. Use for example `assets/**/sample.csv` to include all
/// `sample.csv` files in `<project root>/assets` or any child directory. To recursively include
/// all files under a directory, use a `/**` suffix, e.g. `src/**`. For performance and
/// reproducibility, avoid unanchored matches such as `**/sample.csv`.
///
/// The glob syntax is the reduced portable glob from
/// [PEP 639](https://peps.python.org/pep-0639/#add-license-files-key).
include: Vec<String>,
/// Glob expressions specifying which files and directories to exclude from the previously
/// selected source distribution includes.
///
/// Excludes are not anchored, which means that `__pycache__` excludes all directories named
/// `__pycache__` and their children anywhere. To anchor a directory, use a `/` prefix, e.g.,
/// `/dist` will exclude only `<project root>/dist`.
///
/// The glob syntax is the reduced portable glob from
/// [PEP 639](https://peps.python.org/pep-0639/#add-license-files-key).
exclude: Vec<String>,
}
impl Default for SourceDistSettings {
    /// The out-of-the-box selection: everything below `src` plus the top-level
    /// `pyproject.toml`, minus `__pycache__` entries and `*.pyc` / `*.pyo` files.
    fn default() -> Self {
        let default_includes = ["src/**", "pyproject.toml"];
        let default_excludes = ["__pycache__", "*.pyc", "*.pyo"];
        Self {
            include: default_includes.iter().copied().map(String::from).collect(),
            exclude: default_excludes.iter().copied().map(String::from).collect(),
        }
    }
}
/// Build a source distribution from the source tree and place it in the output directory.
pub fn build_source_dist(
source_tree: &Path,
source_dist_directory: &Path,
settings: SourceDistSettings,
uv_version: &str,
) -> Result<SourceDistFilename, Error> {
let contents = fs_err::read_to_string(source_tree.join("pyproject.toml"))?;
@ -392,42 +445,75 @@ pub fn build_source_dist(
)
.map_err(|err| Error::TarWrite(source_dist_path.clone(), err))?;
let includes = ["src/**/*", "pyproject.toml"];
let mut include_builder = GlobSetBuilder::new();
for include in includes {
include_builder.add(Glob::new(include)?);
let mut include_globs = Vec::new();
for include in settings.include {
let glob = parse_portable_glob(&include).map_err(|err| Error::PortableGlob {
field: "tool.uv.source-dist.include".to_string(),
source: err,
})?;
include_globs.push(glob.clone());
}
let include_matcher = include_builder.build()?;
let include_matcher =
GlobDirFilter::from_globs(&include_globs).map_err(|err| Error::GlobSetTooLarge {
field: "tool.uv.source-dist.include".to_string(),
source: err,
})?;
let excludes = ["__pycache__", "*.pyc", "*.pyo"];
let mut exclude_builder = GlobSetBuilder::new();
for exclude in excludes {
exclude_builder.add(Glob::new(exclude)?);
for exclude in settings.exclude {
// Excludes are unanchored
let exclude = if let Some(exclude) = exclude.strip_prefix("/") {
exclude.to_string()
} else {
format!("**/{exclude}").to_string()
};
let glob = parse_portable_glob(&exclude).map_err(|err| Error::PortableGlob {
field: "tool.uv.source-dist.exclude".to_string(),
source: err,
})?;
exclude_builder.add(glob);
}
let exclude_matcher = exclude_builder.build()?;
let exclude_matcher = exclude_builder
.build()
.map_err(|err| Error::GlobSetTooLarge {
field: "tool.uv.source-dist.exclude".to_string(),
source: err,
})?;
// TODO(konsti): Add files linked by pyproject.toml
for file in WalkDir::new(source_tree).into_iter().filter_entry(|dir| {
let relative = dir
.path()
.strip_prefix(source_tree)
.expect("walkdir starts with root");
// TODO(konsti): Also check that we're matching at least a prefix of an include matcher.
!exclude_matcher.is_match(relative)
}) {
let entry = file.map_err(|err| Error::WalkDir {
root: source_tree.to_path_buf(),
err,
})?;
for entry in WalkDir::new(source_tree).into_iter().filter_entry(|entry| {
// TODO(konsti): This should be prettier.
let relative = entry
.path()
.strip_prefix(source_tree)
.expect("walkdir starts with root");
if !include_matcher.is_match(relative) {
.expect("walkdir starts with root")
.to_path_buf();
// Fast path: Don't descend into a directory that can't be included. This is the most
// important performance optimization, it avoids descending into directories such as
// `.venv`. While walkdir is generally cheap, we still avoid traversing large data
// directories that often exist on the top level of a project. This is especially noticeable
// on network file systems with high latencies per operation (while contiguous reading may
// still be fast).
include_matcher.match_directory(&relative) && !exclude_matcher.is_match(&relative)
}) {
let entry = entry.map_err(|err| Error::WalkDir {
root: source_tree.to_path_buf(),
err,
})?;
// TODO(konsti): This should be prettier.
let relative = entry
.path()
.strip_prefix(source_tree)
.expect("walkdir starts with root")
.to_path_buf();
if !include_matcher.match_path(&relative) || exclude_matcher.is_match(&relative) {
trace!("Excluding {}", relative.user_display());
continue;
}
};
debug!("Including {}", relative.user_display());
let metadata = fs_err::metadata(entry.path())?;
@ -462,7 +548,7 @@ pub fn build_source_dist(
.map_err(|err| Error::TarWrite(source_dist_path.clone(), err))?;
} else {
return Err(Error::UnsupportedFileType(
relative.to_path_buf(),
relative.clone(),
entry.file_type(),
));
}

View file

@ -1,19 +1,21 @@
use crate::pep639_glob::parse_pep639_glob;
use crate::Error;
use globset::{Glob, GlobSetBuilder};
use itertools::Itertools;
use serde::Deserialize;
use std::collections::{BTreeMap, Bound};
use std::ffi::OsStr;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use tracing::debug;
use tracing::{debug, trace};
use uv_fs::Simplified;
use uv_globfilter::parse_portable_glob;
use uv_normalize::{ExtraName, PackageName};
use uv_pep440::{Version, VersionSpecifiers};
use uv_pep508::{Requirement, VersionOrUrl};
use uv_pypi_types::{Metadata23, VerbatimParsedUrl};
use uv_warnings::warn_user_once;
use version_ranges::Ranges;
use walkdir::WalkDir;
#[derive(Debug, Error)]
pub enum ValidationError {
@ -312,27 +314,53 @@ impl PyProjectToml {
};
let mut license_files = Vec::new();
let mut license_glob_builder = GlobSetBuilder::new();
for license_glob in license_globs {
let pep639_glob = parse_pep639_glob(license_glob)
.map_err(|err| Error::Pep639Glob(license_glob.to_string(), err))?;
let absolute_glob = PathBuf::from(glob::Pattern::escape(
let pep639_glob =
parse_portable_glob(license_glob).map_err(|err| Error::PortableGlob {
field: license_glob.to_string(),
source: err,
})?;
let absolute_glob = PathBuf::from(globset::escape(
root.simplified().to_string_lossy().as_ref(),
))
.join(pep639_glob.to_string())
.to_string_lossy()
.to_string();
for license_file in glob::glob(&absolute_glob)
.map_err(|err| Error::Pattern(absolute_glob.to_string(), err))?
{
let license_file = license_file
.map_err(Error::Glob)?
.to_string_lossy()
.to_string();
if !license_files.contains(&license_file) {
license_files.push(license_file);
license_glob_builder.add(Glob::new(&absolute_glob).map_err(|err| {
Error::GlobSet {
field: "project.license-files".to_string(),
err,
}
})?);
}
let license_globs = license_glob_builder.build().map_err(|err| Error::GlobSet {
field: "project.license-files".to_string(),
err,
})?;
for entry in WalkDir::new(".") {
let entry = entry.map_err(|err| Error::WalkDir {
root: PathBuf::from("."),
err,
})?;
let relative = entry
.path()
.strip_prefix("./")
.expect("walkdir starts with root");
if !license_globs.is_match(relative) {
trace!("Not a license files match: `{}`", relative.user_display());
continue;
}
debug!("License files match: `{}`", relative.user_display());
let license_file = relative.to_string_lossy().to_string();
if !license_files.contains(&license_file) {
license_files.push(license_file);
}
}
// The glob order may be unstable
license_files.sort();

View file

@ -1,81 +0,0 @@
//! Implementation of PEP 639 cross-language restricted globs.
use glob::{Pattern, PatternError};
use thiserror::Error;
#[derive(Debug, Error)]
pub enum Pep639GlobError {
#[error(transparent)]
PatternError(#[from] PatternError),
#[error("The parent directory operator (`..`) at position {pos} is not allowed in license file globs")]
ParentDirectory { pos: usize },
#[error("Glob contains invalid character at position {pos}: `{invalid}`")]
InvalidCharacter { pos: usize, invalid: char },
#[error("Glob contains invalid character in range at position {pos}: `{invalid}`")]
InvalidCharacterRange { pos: usize, invalid: char },
}
/// Parse a PEP 639 `license-files` glob.
///
/// The syntax is more restricted than regular globbing in Python or Rust for platform independent
/// results. Since [`glob::Pattern`] is a superset over this format, we can use it after validating
/// that no unsupported features are in the string.
///
/// From [PEP 639](https://peps.python.org/pep-0639/#add-license-files-key):
///
/// > Its value is an array of strings which MUST contain valid glob patterns,
/// > as specified below:
/// >
/// > - Alphanumeric characters, underscores (`_`), hyphens (`-`) and dots (`.`)
/// > MUST be matched verbatim.
/// >
/// > - Special glob characters: `*`, `?`, `**` and character ranges: `[]`
/// > containing only the verbatim matched characters MUST be supported.
/// > Within `[...]`, the hyphen indicates a range (e.g. `a-z`).
/// > Hyphens at the start or end are matched literally.
/// >
/// > - Path delimiters MUST be the forward slash character (`/`).
/// > Patterns are relative to the directory containing `pyproject.toml`,
/// > therefore the leading slash character MUST NOT be used.
/// >
/// > - Parent directory indicators (`..`) MUST NOT be used.
/// >
/// > Any characters or character sequences not covered by this specification are
/// > invalid. Projects MUST NOT use such values.
/// > Tools consuming this field MAY reject invalid values with an error.
pub(crate) fn parse_pep639_glob(glob: &str) -> Result<Pattern, Pep639GlobError> {
let mut chars = glob.chars().enumerate().peekable();
// A `..` is on a parent directory indicator at the start of the string or after a directory
// separator.
let mut start_or_slash = true;
while let Some((pos, c)) = chars.next() {
if c.is_alphanumeric() || matches!(c, '_' | '-' | '*' | '?') {
start_or_slash = false;
} else if c == '.' {
if start_or_slash && matches!(chars.peek(), Some((_, '.'))) {
return Err(Pep639GlobError::ParentDirectory { pos });
}
start_or_slash = false;
} else if c == '/' {
start_or_slash = true;
} else if c == '[' {
for (pos, c) in chars.by_ref() {
// TODO: https://discuss.python.org/t/pep-639-round-3-improving-license-clarity-with-better-package-metadata/53020/98
if c.is_alphanumeric() || matches!(c, '_' | '-' | '.') {
// Allowed.
} else if c == ']' {
break;
} else {
return Err(Pep639GlobError::InvalidCharacterRange { pos, invalid: c });
}
}
start_or_slash = false;
} else {
return Err(Pep639GlobError::InvalidCharacter { pos, invalid: c });
}
}
Ok(Pattern::new(glob)?)
}
#[cfg(test)]
mod tests;

View file

@ -1,54 +0,0 @@
use super::*;
use insta::assert_snapshot;
#[test]
fn test_error() {
let parse_err = |glob| parse_pep639_glob(glob).unwrap_err().to_string();
assert_snapshot!(
parse_err(".."),
@"The parent directory operator (`..`) at position 0 is not allowed in license file globs"
);
assert_snapshot!(
parse_err("licenses/.."),
@"The parent directory operator (`..`) at position 9 is not allowed in license file globs"
);
assert_snapshot!(
parse_err("licenses/LICEN!E.txt"),
@"Glob contains invalid character at position 14: `!`"
);
assert_snapshot!(
parse_err("licenses/LICEN[!C]E.txt"),
@"Glob contains invalid character in range at position 15: `!`"
);
assert_snapshot!(
parse_err("licenses/LICEN[C?]E.txt"),
@"Glob contains invalid character in range at position 16: `?`"
);
assert_snapshot!(parse_err("******"), @"Pattern syntax error near position 2: wildcards are either regular `*` or recursive `**`");
assert_snapshot!(
parse_err(r"licenses\eula.txt"),
@r"Glob contains invalid character at position 8: `\`"
);
}
#[test]
fn test_valid() {
let cases = [
"licenses/*.txt",
"licenses/**/*.txt",
"LICEN[CS]E.txt",
"LICEN?E.txt",
"[a-z].txt",
"[a-z._-].txt",
"*/**",
"LICENSE..txt",
"LICENSE_file-1.txt",
// (google translate)
"licenses/라이센스*.txt",
"licenses/ライセンス*.txt",
"licenses/执照*.txt",
];
for case in cases {
parse_pep639_glob(case).unwrap();
}
}

View file

@ -79,7 +79,7 @@ fn test_prepare_metadata() {
.unwrap()
.path()
.strip_prefix(metadata_dir.path())
.unwrap()
.expect("walkdir starts with root")
.portable_display()
.to_string()
})

View file

@ -0,0 +1,27 @@
[package]
name = "uv-globfilter"
version = "0.1.0"
readme = "README.md"
edition.workspace = true
rust-version.workspace = true
homepage.workspace = true
documentation.workspace = true
repository.workspace = true
authors.workspace = true
license.workspace = true
[dependencies]
globset = { workspace = true }
regex = { workspace = true }
regex-automata = { workspace = true }
thiserror = { workspace = true }
tracing = { workspace = true }
walkdir = { workspace = true }
[dev-dependencies]
fs-err = "2.11.0"
insta = "1.41.1"
tempfile = "3.14.0"
[lints]
workspace = true

View file

@ -0,0 +1,34 @@
# globfilter
Portable directory walking with includes and excludes.
Motivating example: You want to allow the user to select paths within a project.
```toml
include = ["src", "License.txt", "resources/icons/*.svg"]
exclude = ["target", "/dist", ".cache", "*.tmp"]
```
When traversing the directory, you can use
`GlobDirFilter::from_globs(...)?.match_directory(&relative)` to skip directories that never match
in `WalkDir`'s `filter_entry`.
## Syntax
This crate supports the cross-language, restricted glob syntax from
[PEP 639](https://peps.python.org/pep-0639/#add-license-FILES-key):
- Alphanumeric characters, underscores (`_`), hyphens (`-`) and dots (`.`) are matched verbatim.
- The special glob characters are:
- `*`: Matches any number of characters except path separators
- `?`: Matches a single character except the path separator
- `**`: Matches any number of characters including path separators
- `[]`, containing only the verbatim matched characters: Matches a single one of the contained
characters. Within `[...]`, the hyphen indicates a locale-agnostic range (e.g. `a-z`, order based
on Unicode code points). Hyphens at the start or end are matched literally.
- The path separator is the forward slash character (`/`). Patterns are relative to the given
directory, a leading slash character for absolute paths is not supported.
- Parent directory indicators (`..`) are not allowed.
These rules mean that matching the backslash (`\`) is forbidden, which avoids collisions with the
Windows path separator.

View file

@ -0,0 +1,276 @@
use globset::{Glob, GlobSet, GlobSetBuilder};
use regex_automata::dfa;
use regex_automata::dfa::Automaton;
use std::path::{Path, MAIN_SEPARATOR, MAIN_SEPARATOR_STR};
use tracing::warn;
/// Size limit handed to the dense DFA builder, both as the limit for the final DFA
/// (`dfa_size_limit`) and for the determinization step (`determinize_size_limit`).
///
/// Chosen at a whim -Konsti
const DFA_SIZE_LIMIT: usize = 1_000_000;
/// Filter a directory tree traversal (walkdir) by whether any paths of a directory can be included
/// at all.
///
/// Internally, the globs are converted to a regex and then to a DFA, which unlike the globs and the
/// regex allows to check for prefix matches.
pub struct GlobDirFilter {
// The combined glob matcher, used for exact (whole path) matches.
glob_set: GlobSet,
// The DFA built from the regexes of all globs, used for prefix matching. `None` if the DFA could
// not be built, e.g., because it would exceed `DFA_SIZE_LIMIT`.
dfa: Option<dfa::dense::DFA<Vec<u32>>>,
}
impl GlobDirFilter {
    /// Build a filter that matches if any of the globs matches.
    ///
    /// Besides the regular glob set, this tries to compile the globs into a single anchored
    /// dense DFA for prefix matching. If the DFA can't be built (e.g., it would exceed
    /// [`DFA_SIZE_LIMIT`]), a warning is logged and the filter works without it, in which case
    /// [`GlobDirFilter::match_directory`] doesn't prune any directory.
    ///
    /// # Errors
    ///
    /// Returns the [`globset::Error`] from building the glob set, see
    /// <https://github.com/BurntSushi/ripgrep/discussions/2927>.
    pub fn from_globs(globs: &[Glob]) -> Result<Self, globset::Error> {
        let mut glob_set_builder = GlobSetBuilder::new();
        for glob in globs {
            glob_set_builder.add(glob.clone());
        }
        let glob_set = glob_set_builder.build()?;

        // The escaped separator is the same for all globs, compute it only once.
        let main_separator = regex::escape(MAIN_SEPARATOR_STR);
        let regexes: Vec<_> = globs
            .iter()
            .map(|glob| {
                glob.regex()
                    // We are using a custom DFA builder
                    .strip_prefix("(?-u)")
                    .expect("a glob is a non-unicode byte regex")
                    // Match windows paths if applicable
                    .replace('/', &main_separator)
            })
            .collect();

        let dfa_builder = dfa::dense::Builder::new()
            .syntax(
                // The glob regex is a byte matcher
                regex_automata::util::syntax::Config::new()
                    .unicode(false)
                    .utf8(false),
            )
            .configure(
                dfa::dense::Config::new()
                    .start_kind(dfa::StartKind::Anchored)
                    // DFA can grow exponentially, in which case we bail out
                    .dfa_size_limit(Some(DFA_SIZE_LIMIT))
                    .determinize_size_limit(Some(DFA_SIZE_LIMIT)),
            )
            .build_many(&regexes);
        let dfa = if let Ok(dfa) = dfa_builder {
            Some(dfa)
        } else {
            // TODO(konsti): `regex_automata::dfa::dense::BuildError` should allow asking whether
            // is a size error
            warn!(
                "Glob expressions regex is larger than {DFA_SIZE_LIMIT} bytes, \
                falling back to full directory traversal!"
            );
            None
        };

        Ok(Self { glob_set, dfa })
    }

    /// Whether the path (file or directory) matches any of the globs.
    pub fn match_path(&self, path: &Path) -> bool {
        self.glob_set.is_match(path)
    }

    /// Check whether a directory or any of its children can be matched by any of the globs.
    ///
    /// This option never returns false if any child matches, but it may return true even if we
    /// don't end up including any child. In particular, it always returns true if no DFA could
    /// be built in [`GlobDirFilter::from_globs`], since we can't rule out any directory then.
    ///
    /// `path` is expected to be relative to the directory the globs are anchored at, using the
    /// platform's path separator. The empty path (the root itself) always matches.
    pub fn match_directory(&self, path: &Path) -> bool {
        let Some(dfa) = &self.dfa else {
            // Without a DFA we cannot prove that a directory is irrelevant, so we must descend
            // everywhere (the "full directory traversal" fallback). Returning `false` here would
            // prune the entire tree, including the root.
            return true;
        };

        // Allow the root path
        if path == Path::new("") {
            return true;
        }

        let config_anchored =
            regex_automata::util::start::Config::new().anchored(regex_automata::Anchored::Yes);
        let mut state = dfa
            .start_state(&config_anchored)
            .expect("the DFA was built with anchored start states");

        // Paths aren't necessarily UTF-8, which we can gloss over since the globs match bytes only
        // anyway.
        let byte_path = path.as_os_str().as_encoded_bytes();
        for b in byte_path {
            state = dfa.next_state(state, *b);
        }
        // Say we're looking at a directory `foo/bar`. We want to continue if either `foo/bar` is
        // a match, e.g., from `foo/*`, or a path below it can match, e.g., from `foo/bar/*`.
        let eoi_state = dfa.next_eoi_state(state);
        // We must not call `next_eoi_state` on the slash state, we want to only check if more
        // characters (path components) are allowed, not if we're matching the `$` anchor at the
        // end.
        let separator = u8::try_from(MAIN_SEPARATOR).expect("the path separator is ASCII");
        let slash_state = dfa.next_state(state, separator);

        debug_assert!(
            !dfa.is_quit_state(eoi_state) && !dfa.is_quit_state(slash_state),
            "matcher is in quit state"
        );

        dfa.is_match_state(eoi_state) || !dfa.is_dead_state(slash_state)
    }
}
// Tests for the directory prefilter: direct `match_directory` checks and two walkdir-based
// traversals over a temporary directory tree.
#[cfg(test)]
mod tests {
use crate::glob_dir_filter::GlobDirFilter;
use crate::portable_glob::parse_portable_glob;
use std::path::{Path, MAIN_SEPARATOR};
use tempfile::tempdir;
use walkdir::WalkDir;
// The files created in the temporary directory, one per pattern in `PATTERNS`.
const FILES: [&str; 5] = [
"path1/dir1/subdir/a.txt",
"path2/dir2/subdir/a.txt",
"path3/dir3/subdir/a.txt",
"path4/dir4/subdir/a.txt",
"path5/dir5/subdir/a.txt",
];
// The include globs under test; each one targets the `pathN` tree with the same number.
const PATTERNS: [&str; 5] = [
// Only sufficient for descending one level
"path1/*",
// Only sufficient for descending one level
"path2/dir2",
// Sufficient for descending
"path3/dir3/subdir/a.txt",
// Sufficient for descending
"path4/**/*",
// Not sufficient for descending
"path5",
];
/// Check `match_directory` directly on the second level directories.
#[test]
fn match_directory() {
let patterns = PATTERNS.map(|pattern| parse_portable_glob(pattern).unwrap());
let matcher = GlobDirFilter::from_globs(&patterns).unwrap();
assert!(matcher.match_directory(&Path::new("path1").join("dir1")));
assert!(matcher.match_directory(&Path::new("path2").join("dir2")));
assert!(matcher.match_directory(&Path::new("path3").join("dir3")));
assert!(matcher.match_directory(&Path::new("path4").join("dir4")));
// `path5` only matches the directory itself, nothing below it.
assert!(!matcher.match_directory(&Path::new("path5").join("dir5")));
}
/// Check that we skip directories that can never match.
#[test]
fn prefilter() {
let dir = tempdir().unwrap();
for file in FILES {
let file = dir.path().join(file);
fs_err::create_dir_all(file.parent().unwrap()).unwrap();
fs_err::File::create(file).unwrap();
}
let patterns = PATTERNS.map(|pattern| parse_portable_glob(pattern).unwrap());
let matcher = GlobDirFilter::from_globs(&patterns).unwrap();
// Test the prefix filtering
let mut visited: Vec<_> = WalkDir::new(dir.path())
.into_iter()
.filter_entry(|entry| {
let relative = entry
.path()
.strip_prefix(dir.path())
.expect("walkdir starts with root");
matcher.match_directory(relative)
})
.map(|entry| {
let entry = entry.unwrap();
let relative = entry
.path()
.strip_prefix(dir.path())
.expect("walkdir starts with root")
.to_str()
.unwrap()
.to_string();
// Translate windows paths back to the unix fixture
relative.replace(MAIN_SEPARATOR, "/")
})
.collect();
// The traversal order of walkdir is not asserted, only the set of visited entries.
visited.sort();
assert_eq!(
visited,
[
"",
"path1",
"path1/dir1",
"path2",
"path2/dir2",
"path3",
"path3/dir3",
"path3/dir3/subdir",
"path3/dir3/subdir/a.txt",
"path4",
"path4/dir4",
"path4/dir4/subdir",
"path4/dir4/subdir/a.txt",
"path5"
]
);
}
/// Check that the walkdir yield the correct set of files.
#[test]
fn walk_dir() {
let dir = tempdir().unwrap();
for file in FILES {
let file = dir.path().join(file);
fs_err::create_dir_all(file.parent().unwrap()).unwrap();
fs_err::File::create(file).unwrap();
}
let patterns = PATTERNS.map(|pattern| parse_portable_glob(pattern).unwrap());
let include_matcher = GlobDirFilter::from_globs(&patterns).unwrap();
let walkdir_root = dir.path();
let mut matches: Vec<_> = WalkDir::new(walkdir_root)
.into_iter()
// First stage: prune directories that can't contain any match.
.filter_entry(|entry| {
// TODO(konsti): This should be prettier.
let relative = entry
.path()
.strip_prefix(walkdir_root)
.expect("walkdir starts with root")
.to_path_buf();
include_matcher.match_directory(&relative)
})
// Second stage: keep only the entries that are matched exactly.
.filter_map(|entry| {
let entry = entry.as_ref().unwrap();
// TODO(konsti): This should be prettier.
let relative = entry
.path()
.strip_prefix(walkdir_root)
.expect("walkdir starts with root")
.to_path_buf();
if include_matcher.match_path(&relative) {
// Translate windows paths back to the unix fixture
Some(relative.to_str().unwrap().replace(MAIN_SEPARATOR, "/"))
} else {
None
}
})
.collect();
matches.sort();
assert_eq!(
matches,
[
"path1/dir1",
"path2/dir2",
"path3/dir3/subdir/a.txt",
"path4/dir4",
"path4/dir4/subdir",
"path4/dir4/subdir/a.txt",
"path5"
]
);
}
}

View file

@ -0,0 +1,10 @@
//! Implementation of PEP 639 cross-language restricted globs and a related directory traversal
//! prefilter.
//!
//! The goal is globs that are portable between languages and operating systems.
mod glob_dir_filter;
mod portable_glob;
pub use glob_dir_filter::GlobDirFilter;
pub use portable_glob::{check_portable_glob, parse_portable_glob, PortableGlobError};

View file

@ -0,0 +1,62 @@
#![allow(clippy::print_stdout)]
use globset::GlobSetBuilder;
use std::env::args;
use tracing::trace;
use uv_globfilter::{parse_portable_glob, GlobDirFilter};
use walkdir::WalkDir;
/// Walk the directory given as first command line argument and print all paths (relative to
/// that directory) which are selected by the hardcoded includes and not removed by the hardcoded
/// excludes.
///
/// # Panics
///
/// Panics if the directory argument is missing, if a glob fails to parse or if the directory
/// traversal fails.
fn main() {
    let includes = ["src/**", "pyproject.toml"];
    let excludes = ["__pycache__", "*.pyc", "*.pyo"];

    let mut include_globs = Vec::new();
    for include in includes {
        let glob = parse_portable_glob(include).unwrap();
        include_globs.push(glob);
    }
    let include_matcher = GlobDirFilter::from_globs(&include_globs).unwrap();

    let mut exclude_builder = GlobSetBuilder::new();
    for exclude in excludes {
        // Excludes are unanchored
        let exclude = if let Some(exclude) = exclude.strip_prefix("/") {
            exclude.to_string()
        } else {
            format!("**/{exclude}")
        };
        let glob = parse_portable_glob(&exclude).unwrap();
        exclude_builder.add(glob);
    }
    // https://github.com/BurntSushi/ripgrep/discussions/2927
    let exclude_matcher = exclude_builder.build().unwrap();

    // The first element of `args()` is the path of the executable, the directory to walk is
    // the first real argument.
    let walkdir_root = args()
        .nth(1)
        .expect("missing argument: the directory to walk");

    for entry in WalkDir::new(&walkdir_root)
        .into_iter()
        .filter_entry(|entry| {
            // TODO(konsti): This should be prettier.
            let relative = entry
                .path()
                .strip_prefix(&walkdir_root)
                .expect("walkdir starts with root")
                .to_path_buf();

            // Don't descend into directories that can't contain an include or are excluded.
            include_matcher.match_directory(&relative) && !exclude_matcher.is_match(&relative)
        })
    {
        let entry = entry.unwrap();
        // TODO(konsti): This should be prettier.
        let relative = entry
            .path()
            .strip_prefix(&walkdir_root)
            .expect("walkdir starts with root")
            .to_path_buf();

        // The prefilter above only prunes directories, each entry still needs an exact match.
        if !include_matcher.match_path(&relative) || exclude_matcher.is_match(&relative) {
            trace!("Excluding: {}", relative.display());
            continue;
        }
        println!("{}", relative.display());
    }
}

View file

@ -0,0 +1,194 @@
//! Cross-language glob syntax from [PEP 639](https://peps.python.org/pep-0639/#add-license-FILES-key).
use globset::{Glob, GlobBuilder};
use thiserror::Error;
/// The ways in which a glob can be rejected as not being a portable glob.
///
/// Except for the transparent [`globset::Error`], each variant carries the offending glob and
/// the position of the problem for the error message.
#[derive(Debug, Error)]
pub enum PortableGlobError {
/// Shows the failing glob in the error message.
#[error(transparent)]
GlobError(#[from] globset::Error),
/// The glob contains a parent directory operator (`..`).
#[error(
"The parent directory operator (`..`) at position {pos} is not allowed in glob: `{glob}`"
)]
ParentDirectory { glob: String, pos: usize },
/// The glob contains a character that is not allowed.
#[error("Invalid character `{invalid}` at position {pos} in glob: `{glob}`")]
InvalidCharacter {
glob: String,
pos: usize,
invalid: char,
},
/// Presumably an invalid character inside a `[...]` range, going by the variant name.
///
/// NOTE(review): The message is identical to the one of `InvalidCharacter`, so the two cases
/// can't be told apart from the error output.
#[error("Invalid character `{invalid}` at position {pos} in glob: `{glob}`")]
InvalidCharacterRange {
glob: String,
pos: usize,
invalid: char,
},
/// The glob contains a run of three or more consecutive stars.
///
/// NOTE(review): The message reads "Too many at stars", which looks like a typo for
/// "Too many stars". Check for snapshot tests pinning the message before changing it.
#[error("Too many at stars at position {pos} in glob: `{glob}`")]
TooManyStars { glob: String, pos: usize },
}
/// Parse cross-language glob syntax from [PEP 639](https://peps.python.org/pep-0639/#add-license-FILES-key):
///
/// - Alphanumeric characters, underscores (`_`), hyphens (`-`) and dots (`.`) are matched verbatim.
/// - The special glob characters are:
/// - `*`: Matches any number of characters except path separators
/// - `?`: Matches a single character except the path separator
/// - `**`: Matches any number of characters including path separators
/// - `[]`, containing only the verbatim matched characters: Matches a single of the characters contained. Within
/// `[...]`, the hyphen indicates a locale-agnostic range (e.g. `a-z`, order based on Unicode code points). Hyphens at
/// the start or end are matched literally.
/// - The path separator is the forward slash character (`/`). Patterns are relative to the given directory, a leading slash
/// character for absolute paths is not supported.
/// - Parent directory indicators (`..`) are not allowed.
///
/// These rules mean that matching the backslash (`\`) is forbidden, which avoid collisions with the windows path separator.
pub fn parse_portable_glob(glob: &str) -> Result<Glob, PortableGlobError> {
check_portable_glob(glob)?;
Ok(GlobBuilder::new(glob).literal_separator(true).build()?)
}
/// Check that `glob` only uses the portable glob syntax, without building a matcher.
///
/// See [`parse_portable_glob`] for the accepted syntax.
///
/// # Errors
///
/// The glob is scanned from left to right and the first violation is returned. Positions are
/// zero-based character offsets into `glob`:
///
/// - [`PortableGlobError::TooManyStars`]: a run of three or more `*`, or `**` directly followed
///   by anything other than `/`.
/// - [`PortableGlobError::ParentDirectory`]: two consecutive dots at the start of the glob or
///   directly after a `/`.
/// - [`PortableGlobError::InvalidCharacterRange`]: a character inside `[...]` that is not
///   alphanumeric, `_`, `-` or `.`.
/// - [`PortableGlobError::InvalidCharacter`]: any other character that is not part of the
///   portable syntax.
pub fn check_portable_glob(glob: &str) -> Result<(), PortableGlobError> {
    let mut cursor = glob.chars().enumerate().peekable();
    // `true` at the very beginning of the glob and right after a `/`. Those are the only places
    // where a `..` is treated as a parent directory indicator.
    let mut component_start = true;

    while let Some((pos, c)) = cursor.next() {
        match c {
            '*' => {
                // Swallow the whole run of stars; `pos` keeps pointing at the first one, which
                // is the position reported in the error.
                let mut stars = 1;
                while cursor.next_if(|&(_, next)| next == '*').is_some() {
                    stars += 1;
                }
                // `***` or `**literals` can be expressed with fewer stars. `glob` bans them,
                // `globset` allows them and PEP 639 is ambiguous, so they are filtered out here.
                let non_slash_follows = cursor.peek().is_some_and(|&(_, next)| next != '/');
                if stars >= 3 || (stars == 2 && non_slash_follows) {
                    return Err(PortableGlobError::TooManyStars {
                        glob: glob.to_string(),
                        pos,
                    });
                }
                component_start = false;
            }
            '.' => {
                let second_dot_follows = matches!(cursor.peek(), Some((_, '.')));
                if component_start && second_dot_follows {
                    return Err(PortableGlobError::ParentDirectory {
                        pos,
                        glob: glob.to_string(),
                    });
                }
                component_start = false;
            }
            '/' => component_start = true,
            '[' => {
                // Consume everything up to and including the closing bracket. Running out of
                // input before a `]` shows up is not reported by this check.
                for (inner_pos, inner) in cursor.by_ref() {
                    match inner {
                        ']' => break,
                        '_' | '-' | '.' => {}
                        allowed if allowed.is_alphanumeric() => {}
                        invalid => {
                            return Err(PortableGlobError::InvalidCharacterRange {
                                glob: glob.to_string(),
                                pos: inner_pos,
                                invalid,
                            });
                        }
                    }
                }
                component_start = false;
            }
            '_' | '-' | '?' => component_start = false,
            literal if literal.is_alphanumeric() => component_start = false,
            invalid => {
                return Err(PortableGlobError::InvalidCharacter {
                    glob: glob.to_string(),
                    pos,
                    invalid,
                });
            }
        }
    }

    Ok(())
}
// Unit tests for the portable glob parser: rejected globs are pinned through inline `insta`
// snapshots of the rendered error message, accepted globs only need to parse.
#[cfg(test)]
mod tests {
use super::*;
use insta::assert_snapshot;
/// Each rejected glob must fail with the expected, fully rendered error message. The reported
/// positions are zero-based character offsets into the glob.
#[test]
fn test_error() {
// Render the error of a glob that is expected to be rejected; panics if the glob parses.
let parse_err = |glob| parse_portable_glob(glob).unwrap_err().to_string();
// A bare parent directory indicator.
assert_snapshot!(
parse_err(".."),
@"The parent directory operator (`..`) at position 0 is not allowed in glob: `..`"
);
// A parent directory indicator directly after a path separator.
assert_snapshot!(
parse_err("licenses/.."),
@"The parent directory operator (`..`) at position 9 is not allowed in glob: `licenses/..`"
);
// A character outside the portable set, outside of brackets.
assert_snapshot!(
parse_err("licenses/LICEN!E.txt"),
@"Invalid character `!` at position 14 in glob: `licenses/LICEN!E.txt`"
);
// `!` is not accepted inside brackets either.
assert_snapshot!(
parse_err("licenses/LICEN[!C]E.txt"),
@"Invalid character `!` at position 15 in glob: `licenses/LICEN[!C]E.txt`"
);
// `?` is a wildcard outside of brackets, but not an allowed member inside them.
assert_snapshot!(
parse_err("licenses/LICEN[C?]E.txt"),
@"Invalid character `?` at position 16 in glob: `licenses/LICEN[C?]E.txt`"
);
// Three or more consecutive stars; the position is that of the first star.
assert_snapshot!(
parse_err("******"),
@"Too many at stars at position 0 in glob: `******`"
);
// `**` directly followed by something other than `/`.
assert_snapshot!(
parse_err("licenses/**license"),
@"Too many at stars at position 9 in glob: `licenses/**license`"
);
// Three stars are rejected even when a `/` follows.
assert_snapshot!(
parse_err("licenses/***/licenses.csv"),
@"Too many at stars at position 9 in glob: `licenses/***/licenses.csv`"
);
// The backslash is not a valid separator and cannot be matched.
assert_snapshot!(
parse_err(r"licenses\eula.txt"),
@r"Invalid character `\` at position 8 in glob: `licenses\eula.txt`"
);
}
/// Every glob listed here must pass validation and compile.
#[test]
fn test_valid() {
let cases = [
"licenses/*.txt",
"licenses/**/*.txt",
"LICEN[CS]E.txt",
"LICEN?E.txt",
"[a-z].txt",
"[a-z._-].txt",
"*/**",
// Two dots in the middle of a name are not a parent directory indicator.
"LICENSE..txt",
"LICENSE_file-1.txt",
// Non-ASCII alphanumeric characters are accepted as literals.
// (google translate)
"licenses/라이센스*.txt",
"licenses/ライセンス*.txt",
"licenses/执照*.txt",
"src/**",
];
for case in cases {
parse_portable_glob(case).unwrap();
}
}
}

View file

@ -470,7 +470,7 @@ fn copy_wheel_files(
let entry = entry?;
let path = entry.path();
let relative = path.strip_prefix(&wheel).unwrap();
let relative = path.strip_prefix(&wheel).expect("walkdir starts with root");
let out_path = site_packages.as_ref().join(relative);
if entry.file_type().is_dir() {
@ -500,7 +500,7 @@ fn hardlink_wheel_files(
let entry = entry?;
let path = entry.path();
let relative = path.strip_prefix(&wheel).unwrap();
let relative = path.strip_prefix(&wheel).expect("walkdir starts with root");
let out_path = site_packages.as_ref().join(relative);
if entry.file_type().is_dir() {

View file

@ -312,12 +312,14 @@ pub(crate) fn move_folder_recorded(
let src = entry.path();
// This is the base path for moving to the actual target for the data
// e.g. for data it's without <..>.data/data/
let relative_to_data = src.strip_prefix(src_dir).expect("Prefix must no change");
let relative_to_data = src
.strip_prefix(src_dir)
.expect("walkdir prefix must not change");
// This is the path stored in RECORD
// e.g. for data it's with .data/data/
let relative_to_site_packages = src
.strip_prefix(site_packages)
.expect("Prefix must no change");
.expect("prefix must not change");
let target = dest_dir.join(relative_to_data);
if entry.file_type().is_dir() {
fs::create_dir_all(&target)?;

View file

@ -4,11 +4,13 @@ use crate::commands::ExitStatus;
use anyhow::Result;
use std::env;
use std::path::Path;
use uv_build_backend::SourceDistSettings;
pub(crate) fn build_sdist(sdist_directory: &Path) -> Result<ExitStatus> {
let filename = uv_build_backend::build_source_dist(
&env::current_dir()?,
sdist_directory,
SourceDistSettings::default(),
uv_version::version(),
)?;
println!("{filename}");