mirror of
https://github.com/astral-sh/uv.git
synced 2025-10-28 10:50:29 +00:00
Build backend: Switch to custom glob-walkdir implementation (#9013)
When doing a directory traversal for source dist inclusion, we want to
offer the user include and exclude options, and we want to avoid
traversing irrelevant directories. The latter is important for
performance, especially on network file systems, but also with large
data directories, or (not-included) directories with other permissions.
To support this, we introduce `GlobDirFilter`, which uses a DFA from
regex_automata to determine whether any children of a directory can be
included and skips the directory if not.
The globs are based on PEP 639. The syntax is more restricted than glob
or globset, but it's standardized. I chose it over glob or globset
because we're already using this syntax for `project.license-files` as
required by PEP 639, so it makes sense to use the same globs for all
includes (see e.g.
4f52a3bb62/pyproject.toml (L36-L48)
for an example with the same semantics for include and exclude).
### Semantics
Glob semantics are complex due to mixing directories and files,
expectations around simplicity and our need to exclude most of the tree
in the project from traversal. The current draft uses a syntax that
optimizes for simple default use cases as a starting point.
#### includes
Glob expressions selecting which files and directories to include in the source
distribution.
Includes are anchored, which means that `pyproject.toml` includes only
`<project root>/pyproject.toml`. Use for example `assets/**/sample.csv`
to include all
`sample.csv` files in `<project root>/assets` or any child directory. To
recursively include
all files under a directory, use a `/**` suffix, e.g. `src/**`. For
performance and
reproducibility, avoid unanchored matches such as `**/sample.csv`.
The glob syntax is the reduced portable glob from
[PEP 639](https://peps.python.org/pep-0639/#add-license-FILES-key).
#### excludes
Glob expressions selecting which files and directories to exclude from the
previous source
distribution includes.
Excludes are not anchored, which means that `__pycache__` excludes all
directories named
`__pycache__` and its children anywhere. To anchor a directory, use a
`/` prefix, e.g.,
`/dist` will exclude only `<project root>/dist`.
The glob syntax is the reduced portable glob from
[PEP 639](https://peps.python.org/pep-0639/#add-license-FILES-key).
This commit is contained in:
parent
e310dcc7c1
commit
4ac78f673b
17 changed files with 798 additions and 195 deletions
|
|
@ -1,13 +1,10 @@
|
|||
mod metadata;
|
||||
mod pep639_glob;
|
||||
|
||||
use crate::metadata::{PyProjectToml, ValidationError};
|
||||
use crate::pep639_glob::Pep639GlobError;
|
||||
use flate2::write::GzEncoder;
|
||||
use flate2::Compression;
|
||||
use fs_err::File;
|
||||
use glob::{GlobError, PatternError};
|
||||
use globset::{Glob, GlobSetBuilder};
|
||||
use globset::GlobSetBuilder;
|
||||
use itertools::Itertools;
|
||||
use sha2::{Digest, Sha256};
|
||||
use std::fs::FileType;
|
||||
|
|
@ -19,6 +16,7 @@ use thiserror::Error;
|
|||
use tracing::{debug, trace};
|
||||
use uv_distribution_filename::{SourceDistExtension, SourceDistFilename, WheelFilename};
|
||||
use uv_fs::Simplified;
|
||||
use uv_globfilter::{parse_portable_glob, GlobDirFilter, PortableGlobError};
|
||||
use walkdir::WalkDir;
|
||||
use zip::{CompressionMethod, ZipWriter};
|
||||
|
||||
|
|
@ -30,16 +28,26 @@ pub enum Error {
|
|||
Toml(#[from] toml::de::Error),
|
||||
#[error("Invalid pyproject.toml")]
|
||||
Validation(#[from] ValidationError),
|
||||
#[error("Invalid `project.license-files` glob expression: `{0}`")]
|
||||
Pep639Glob(String, #[source] Pep639GlobError),
|
||||
#[error("The `project.license-files` entry is not a valid glob pattern: `{0}`")]
|
||||
Pattern(String, #[source] PatternError),
|
||||
/// [`GlobError`] is a wrapped io error.
|
||||
#[error(transparent)]
|
||||
Glob(#[from] GlobError),
|
||||
#[error("Unsupported glob expression in: `{field}`")]
|
||||
PortableGlob {
|
||||
field: String,
|
||||
#[source]
|
||||
source: PortableGlobError,
|
||||
},
|
||||
/// <https://github.com/BurntSushi/ripgrep/discussions/2927>
|
||||
#[error("Glob expressions caused to large regex in: `{field}`")]
|
||||
GlobSetTooLarge {
|
||||
field: String,
|
||||
#[source]
|
||||
source: globset::Error,
|
||||
},
|
||||
/// [`globset::Error`] shows the glob that failed to parse.
|
||||
#[error(transparent)]
|
||||
GlobSet(#[from] globset::Error),
|
||||
#[error("Unsupported glob expression in: `{field}`")]
|
||||
GlobSet {
|
||||
field: String,
|
||||
#[source]
|
||||
err: globset::Error,
|
||||
},
|
||||
#[error("Failed to walk source tree: `{}`", root.user_display())]
|
||||
WalkDir {
|
||||
root: PathBuf,
|
||||
|
|
@ -322,7 +330,10 @@ pub fn build_wheel(
|
|||
err,
|
||||
})?;
|
||||
|
||||
let relative_path = entry.path().strip_prefix(&strip_root)?;
|
||||
let relative_path = entry
|
||||
.path()
|
||||
.strip_prefix(&strip_root)
|
||||
.expect("walkdir starts with root");
|
||||
let relative_path_str = relative_path
|
||||
.to_str()
|
||||
.ok_or_else(|| Error::NotUtf8Path(relative_path.to_path_buf()))?;
|
||||
|
|
@ -354,10 +365,52 @@ pub fn build_wheel(
|
|||
Ok(filename)
|
||||
}
|
||||
|
||||
/// TODO(konsti): Wire this up with actual settings and remove this struct.
|
||||
///
|
||||
/// To select which files to include in the source distribution, we first add the includes, then
|
||||
/// remove the excludes from that.
|
||||
pub struct SourceDistSettings {
|
||||
/// Glob expressions which files and directories to include in the source distribution.
|
||||
///
|
||||
/// Includes are anchored, which means that `pyproject.toml` includes only
|
||||
/// `<project root>/pyproject.toml`. Use for example `assets/**/sample.csv` to include all
|
||||
/// `sample.csv` files in `<project root>/assets` or any child directory. To recursively include
|
||||
/// all files under a directory, use a `/**` suffix, e.g. `src/**`. For performance and
|
||||
/// reproducibility, avoid unanchored matches such as `**/sample.csv`.
|
||||
///
|
||||
/// The glob syntax is the reduced portable glob from
|
||||
/// [PEP 639](https://peps.python.org/pep-0639/#add-license-FILES-key).
|
||||
include: Vec<String>,
|
||||
/// Glob expressions which files and directories to exclude from the previous source
|
||||
/// distribution includes.
|
||||
///
|
||||
/// Excludes are not anchored, which means that `__pycache__` excludes all directories named
|
||||
/// `__pycache__` and its children anywhere. To anchor a directory, use a `/` prefix, e.g.,
|
||||
/// `/dist` will exclude only `<project root>/dist`.
|
||||
///
|
||||
/// The glob syntax is the reduced portable glob from
|
||||
/// [PEP 639](https://peps.python.org/pep-0639/#add-license-FILES-key).
|
||||
exclude: Vec<String>,
|
||||
}
|
||||
|
||||
impl Default for SourceDistSettings {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
include: vec!["src/**".to_string(), "pyproject.toml".to_string()],
|
||||
exclude: vec![
|
||||
"__pycache__".to_string(),
|
||||
"*.pyc".to_string(),
|
||||
"*.pyo".to_string(),
|
||||
],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a source distribution from the source tree and place it in the output directory.
|
||||
pub fn build_source_dist(
|
||||
source_tree: &Path,
|
||||
source_dist_directory: &Path,
|
||||
settings: SourceDistSettings,
|
||||
uv_version: &str,
|
||||
) -> Result<SourceDistFilename, Error> {
|
||||
let contents = fs_err::read_to_string(source_tree.join("pyproject.toml"))?;
|
||||
|
|
@ -392,42 +445,75 @@ pub fn build_source_dist(
|
|||
)
|
||||
.map_err(|err| Error::TarWrite(source_dist_path.clone(), err))?;
|
||||
|
||||
let includes = ["src/**/*", "pyproject.toml"];
|
||||
let mut include_builder = GlobSetBuilder::new();
|
||||
for include in includes {
|
||||
include_builder.add(Glob::new(include)?);
|
||||
let mut include_globs = Vec::new();
|
||||
for include in settings.include {
|
||||
let glob = parse_portable_glob(&include).map_err(|err| Error::PortableGlob {
|
||||
field: "tool.uv.source-dist.include".to_string(),
|
||||
source: err,
|
||||
})?;
|
||||
include_globs.push(glob.clone());
|
||||
}
|
||||
let include_matcher = include_builder.build()?;
|
||||
let include_matcher =
|
||||
GlobDirFilter::from_globs(&include_globs).map_err(|err| Error::GlobSetTooLarge {
|
||||
field: "tool.uv.source-dist.include".to_string(),
|
||||
source: err,
|
||||
})?;
|
||||
|
||||
let excludes = ["__pycache__", "*.pyc", "*.pyo"];
|
||||
let mut exclude_builder = GlobSetBuilder::new();
|
||||
for exclude in excludes {
|
||||
exclude_builder.add(Glob::new(exclude)?);
|
||||
for exclude in settings.exclude {
|
||||
// Excludes are unanchored
|
||||
let exclude = if let Some(exclude) = exclude.strip_prefix("/") {
|
||||
exclude.to_string()
|
||||
} else {
|
||||
format!("**/{exclude}").to_string()
|
||||
};
|
||||
let glob = parse_portable_glob(&exclude).map_err(|err| Error::PortableGlob {
|
||||
field: "tool.uv.source-dist.exclude".to_string(),
|
||||
source: err,
|
||||
})?;
|
||||
exclude_builder.add(glob);
|
||||
}
|
||||
let exclude_matcher = exclude_builder.build()?;
|
||||
let exclude_matcher = exclude_builder
|
||||
.build()
|
||||
.map_err(|err| Error::GlobSetTooLarge {
|
||||
field: "tool.uv.source-dist.exclude".to_string(),
|
||||
source: err,
|
||||
})?;
|
||||
|
||||
// TODO(konsti): Add files linked by pyproject.toml
|
||||
|
||||
for file in WalkDir::new(source_tree).into_iter().filter_entry(|dir| {
|
||||
let relative = dir
|
||||
.path()
|
||||
.strip_prefix(source_tree)
|
||||
.expect("walkdir starts with root");
|
||||
// TODO(konsti): Also check that we're matching at least a prefix of an include matcher.
|
||||
!exclude_matcher.is_match(relative)
|
||||
}) {
|
||||
let entry = file.map_err(|err| Error::WalkDir {
|
||||
root: source_tree.to_path_buf(),
|
||||
err,
|
||||
})?;
|
||||
for entry in WalkDir::new(source_tree).into_iter().filter_entry(|entry| {
|
||||
// TODO(konsti): This should be prettier.
|
||||
let relative = entry
|
||||
.path()
|
||||
.strip_prefix(source_tree)
|
||||
.expect("walkdir starts with root");
|
||||
if !include_matcher.is_match(relative) {
|
||||
.expect("walkdir starts with root")
|
||||
.to_path_buf();
|
||||
|
||||
// Fast path: Don't descend into a directory that can't be included. This is the most
|
||||
// important performance optimization, it avoids descending into directories such as
|
||||
// `.venv`. While walkdir is generally cheap, we still avoid traversing large data
|
||||
// directories that often exist on the top level of a project. This is especially noticeable
|
||||
// on network file systems with high latencies per operation (while contiguous reading may
|
||||
// still be fast).
|
||||
include_matcher.match_directory(&relative) && !exclude_matcher.is_match(&relative)
|
||||
}) {
|
||||
let entry = entry.map_err(|err| Error::WalkDir {
|
||||
root: source_tree.to_path_buf(),
|
||||
err,
|
||||
})?;
|
||||
// TODO(konsti): This should be prettier.
|
||||
let relative = entry
|
||||
.path()
|
||||
.strip_prefix(source_tree)
|
||||
.expect("walkdir starts with root")
|
||||
.to_path_buf();
|
||||
|
||||
if !include_matcher.match_path(&relative) || exclude_matcher.is_match(&relative) {
|
||||
trace!("Excluding {}", relative.user_display());
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
debug!("Including {}", relative.user_display());
|
||||
|
||||
let metadata = fs_err::metadata(entry.path())?;
|
||||
|
|
@ -462,7 +548,7 @@ pub fn build_source_dist(
|
|||
.map_err(|err| Error::TarWrite(source_dist_path.clone(), err))?;
|
||||
} else {
|
||||
return Err(Error::UnsupportedFileType(
|
||||
relative.to_path_buf(),
|
||||
relative.clone(),
|
||||
entry.file_type(),
|
||||
));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,19 +1,21 @@
|
|||
use crate::pep639_glob::parse_pep639_glob;
|
||||
use crate::Error;
|
||||
use globset::{Glob, GlobSetBuilder};
|
||||
use itertools::Itertools;
|
||||
use serde::Deserialize;
|
||||
use std::collections::{BTreeMap, Bound};
|
||||
use std::ffi::OsStr;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::str::FromStr;
|
||||
use tracing::debug;
|
||||
use tracing::{debug, trace};
|
||||
use uv_fs::Simplified;
|
||||
use uv_globfilter::parse_portable_glob;
|
||||
use uv_normalize::{ExtraName, PackageName};
|
||||
use uv_pep440::{Version, VersionSpecifiers};
|
||||
use uv_pep508::{Requirement, VersionOrUrl};
|
||||
use uv_pypi_types::{Metadata23, VerbatimParsedUrl};
|
||||
use uv_warnings::warn_user_once;
|
||||
use version_ranges::Ranges;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum ValidationError {
|
||||
|
|
@ -312,27 +314,53 @@ impl PyProjectToml {
|
|||
};
|
||||
|
||||
let mut license_files = Vec::new();
|
||||
let mut license_glob_builder = GlobSetBuilder::new();
|
||||
for license_glob in license_globs {
|
||||
let pep639_glob = parse_pep639_glob(license_glob)
|
||||
.map_err(|err| Error::Pep639Glob(license_glob.to_string(), err))?;
|
||||
let absolute_glob = PathBuf::from(glob::Pattern::escape(
|
||||
let pep639_glob =
|
||||
parse_portable_glob(license_glob).map_err(|err| Error::PortableGlob {
|
||||
field: license_glob.to_string(),
|
||||
source: err,
|
||||
})?;
|
||||
let absolute_glob = PathBuf::from(globset::escape(
|
||||
root.simplified().to_string_lossy().as_ref(),
|
||||
))
|
||||
.join(pep639_glob.to_string())
|
||||
.to_string_lossy()
|
||||
.to_string();
|
||||
for license_file in glob::glob(&absolute_glob)
|
||||
.map_err(|err| Error::Pattern(absolute_glob.to_string(), err))?
|
||||
{
|
||||
let license_file = license_file
|
||||
.map_err(Error::Glob)?
|
||||
.to_string_lossy()
|
||||
.to_string();
|
||||
if !license_files.contains(&license_file) {
|
||||
license_files.push(license_file);
|
||||
license_glob_builder.add(Glob::new(&absolute_glob).map_err(|err| {
|
||||
Error::GlobSet {
|
||||
field: "project.license-files".to_string(),
|
||||
err,
|
||||
}
|
||||
})?);
|
||||
}
|
||||
let license_globs = license_glob_builder.build().map_err(|err| Error::GlobSet {
|
||||
field: "project.license-files".to_string(),
|
||||
err,
|
||||
})?;
|
||||
|
||||
for entry in WalkDir::new(".") {
|
||||
let entry = entry.map_err(|err| Error::WalkDir {
|
||||
root: PathBuf::from("."),
|
||||
err,
|
||||
})?;
|
||||
let relative = entry
|
||||
.path()
|
||||
.strip_prefix("./")
|
||||
.expect("walkdir starts with root");
|
||||
if !license_globs.is_match(relative) {
|
||||
trace!("Not a license files match: `{}`", relative.user_display());
|
||||
continue;
|
||||
}
|
||||
|
||||
debug!("License files match: `{}`", relative.user_display());
|
||||
let license_file = relative.to_string_lossy().to_string();
|
||||
|
||||
if !license_files.contains(&license_file) {
|
||||
license_files.push(license_file);
|
||||
}
|
||||
}
|
||||
|
||||
// The glob order may be unstable
|
||||
license_files.sort();
|
||||
|
||||
|
|
|
|||
|
|
@ -1,81 +0,0 @@
|
|||
//! Implementation of PEP 639 cross-language restricted globs.
|
||||
|
||||
use glob::{Pattern, PatternError};
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum Pep639GlobError {
|
||||
#[error(transparent)]
|
||||
PatternError(#[from] PatternError),
|
||||
#[error("The parent directory operator (`..`) at position {pos} is not allowed in license file globs")]
|
||||
ParentDirectory { pos: usize },
|
||||
#[error("Glob contains invalid character at position {pos}: `{invalid}`")]
|
||||
InvalidCharacter { pos: usize, invalid: char },
|
||||
#[error("Glob contains invalid character in range at position {pos}: `{invalid}`")]
|
||||
InvalidCharacterRange { pos: usize, invalid: char },
|
||||
}
|
||||
|
||||
/// Parse a PEP 639 `license-files` glob.
|
||||
///
|
||||
/// The syntax is more restricted than regular globbing in Python or Rust for platform independent
|
||||
/// results. Since [`glob::Pattern`] is a superset over this format, we can use it after validating
|
||||
/// that no unsupported features are in the string.
|
||||
///
|
||||
/// From [PEP 639](https://peps.python.org/pep-0639/#add-license-files-key):
|
||||
///
|
||||
/// > Its value is an array of strings which MUST contain valid glob patterns,
|
||||
/// > as specified below:
|
||||
/// >
|
||||
/// > - Alphanumeric characters, underscores (`_`), hyphens (`-`) and dots (`.`)
|
||||
/// > MUST be matched verbatim.
|
||||
/// >
|
||||
/// > - Special glob characters: `*`, `?`, `**` and character ranges: `[]`
|
||||
/// > containing only the verbatim matched characters MUST be supported.
|
||||
/// > Within `[...]`, the hyphen indicates a range (e.g. `a-z`).
|
||||
/// > Hyphens at the start or end are matched literally.
|
||||
/// >
|
||||
/// > - Path delimiters MUST be the forward slash character (`/`).
|
||||
/// > Patterns are relative to the directory containing `pyproject.toml`,
|
||||
/// > therefore the leading slash character MUST NOT be used.
|
||||
/// >
|
||||
/// > - Parent directory indicators (`..`) MUST NOT be used.
|
||||
/// >
|
||||
/// > Any characters or character sequences not covered by this specification are
|
||||
/// > invalid. Projects MUST NOT use such values.
|
||||
/// > Tools consuming this field MAY reject invalid values with an error.
|
||||
pub(crate) fn parse_pep639_glob(glob: &str) -> Result<Pattern, Pep639GlobError> {
|
||||
let mut chars = glob.chars().enumerate().peekable();
|
||||
// A `..` is on a parent directory indicator at the start of the string or after a directory
|
||||
// separator.
|
||||
let mut start_or_slash = true;
|
||||
while let Some((pos, c)) = chars.next() {
|
||||
if c.is_alphanumeric() || matches!(c, '_' | '-' | '*' | '?') {
|
||||
start_or_slash = false;
|
||||
} else if c == '.' {
|
||||
if start_or_slash && matches!(chars.peek(), Some((_, '.'))) {
|
||||
return Err(Pep639GlobError::ParentDirectory { pos });
|
||||
}
|
||||
start_or_slash = false;
|
||||
} else if c == '/' {
|
||||
start_or_slash = true;
|
||||
} else if c == '[' {
|
||||
for (pos, c) in chars.by_ref() {
|
||||
// TODO: https://discuss.python.org/t/pep-639-round-3-improving-license-clarity-with-better-package-metadata/53020/98
|
||||
if c.is_alphanumeric() || matches!(c, '_' | '-' | '.') {
|
||||
// Allowed.
|
||||
} else if c == ']' {
|
||||
break;
|
||||
} else {
|
||||
return Err(Pep639GlobError::InvalidCharacterRange { pos, invalid: c });
|
||||
}
|
||||
}
|
||||
start_or_slash = false;
|
||||
} else {
|
||||
return Err(Pep639GlobError::InvalidCharacter { pos, invalid: c });
|
||||
}
|
||||
}
|
||||
Ok(Pattern::new(glob)?)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
|
@ -1,54 +0,0 @@
|
|||
use super::*;
|
||||
use insta::assert_snapshot;
|
||||
|
||||
#[test]
|
||||
fn test_error() {
|
||||
let parse_err = |glob| parse_pep639_glob(glob).unwrap_err().to_string();
|
||||
assert_snapshot!(
|
||||
parse_err(".."),
|
||||
@"The parent directory operator (`..`) at position 0 is not allowed in license file globs"
|
||||
);
|
||||
assert_snapshot!(
|
||||
parse_err("licenses/.."),
|
||||
@"The parent directory operator (`..`) at position 9 is not allowed in license file globs"
|
||||
);
|
||||
assert_snapshot!(
|
||||
parse_err("licenses/LICEN!E.txt"),
|
||||
@"Glob contains invalid character at position 14: `!`"
|
||||
);
|
||||
assert_snapshot!(
|
||||
parse_err("licenses/LICEN[!C]E.txt"),
|
||||
@"Glob contains invalid character in range at position 15: `!`"
|
||||
);
|
||||
assert_snapshot!(
|
||||
parse_err("licenses/LICEN[C?]E.txt"),
|
||||
@"Glob contains invalid character in range at position 16: `?`"
|
||||
);
|
||||
assert_snapshot!(parse_err("******"), @"Pattern syntax error near position 2: wildcards are either regular `*` or recursive `**`");
|
||||
assert_snapshot!(
|
||||
parse_err(r"licenses\eula.txt"),
|
||||
@r"Glob contains invalid character at position 8: `\`"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_valid() {
|
||||
let cases = [
|
||||
"licenses/*.txt",
|
||||
"licenses/**/*.txt",
|
||||
"LICEN[CS]E.txt",
|
||||
"LICEN?E.txt",
|
||||
"[a-z].txt",
|
||||
"[a-z._-].txt",
|
||||
"*/**",
|
||||
"LICENSE..txt",
|
||||
"LICENSE_file-1.txt",
|
||||
// (google translate)
|
||||
"licenses/라이센스*.txt",
|
||||
"licenses/ライセンス*.txt",
|
||||
"licenses/执照*.txt",
|
||||
];
|
||||
for case in cases {
|
||||
parse_pep639_glob(case).unwrap();
|
||||
}
|
||||
}
|
||||
|
|
@ -79,7 +79,7 @@ fn test_prepare_metadata() {
|
|||
.unwrap()
|
||||
.path()
|
||||
.strip_prefix(metadata_dir.path())
|
||||
.unwrap()
|
||||
.expect("walkdir starts with root")
|
||||
.portable_display()
|
||||
.to_string()
|
||||
})
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue