mirror of
https://github.com/astral-sh/uv.git
synced 2025-10-14 20:39:37 +00:00
Build backend: Switch to custom glob-walkdir implementation (#9013)
When doing a directory traversal for source dist inclusion, we want to
offer the user include and exclude options, and we want to avoid
traversing irrelevant directories. The latter is important for
performance, especially on network file systems, but also with large
data directories, or (not-included) directories with other permissions.
To support this, we introduce `GlobDirFilter`, which uses a DFA from
regex_automata to determine whether any children of a directory can be
included and skips the directory if not.
The globs are based on PEP 639. The syntax is more restricted than glob
or globset, but it's standardized. I chose it over glob or globset
because we're already using this syntax for `project.license-files` a
required by PEP 639, so it makes sense to use the same globs for all
includes (see e.g.
4f52a3bb62/pyproject.toml (L36-L48)
for example with same semantics for include and exclude)
### Semantics
Glob semantics are complex due to mixing directories and files,
expectations around simplicity and our need to exclude most of the tree
in the project from traversal. The current draft uses a syntax that
optimizes for simple default use cases for the start.
#### includes
Glob expressions which files and directories to include in the source
distribution.
Includes are anchored, which means that `pyproject.toml` includes only
`<project root>/pyproject.toml`. Use for example `assets/**/sample.csv`
to include for all
`sample.csv` files in `<project root>/assets` or any child directory. To
recursively include
all files under a directory, use a `/**` suffix, e.g. `src/**`. For
performance and
reproducibility, avoid unanchored matches such as `**/sample.csv`.
The glob syntax is the reduced portable glob from
[PEP 639](https://peps.python.org/pep-0639/#add-license-FILES-key).
#### excludes
Glob expressions which files and directories to exclude from the
previous source
distribution includes.
Excludes are not, which means that `__pycache__` excludes all
directories named
`__pycache__` and it's children anywhere. To anchor a directory, use a
`/` prefix, e.g.,
`/dist` will exclude only `<project root>/dist`.
The glob syntax is the reduced portable glob from
[PEP 639](https://peps.python.org/pep-0639/#add-license-FILES-key).
This commit is contained in:
parent
e310dcc7c1
commit
4ac78f673b
17 changed files with 798 additions and 195 deletions
|
@ -1,13 +1,10 @@
|
|||
mod metadata;
|
||||
mod pep639_glob;
|
||||
|
||||
use crate::metadata::{PyProjectToml, ValidationError};
|
||||
use crate::pep639_glob::Pep639GlobError;
|
||||
use flate2::write::GzEncoder;
|
||||
use flate2::Compression;
|
||||
use fs_err::File;
|
||||
use glob::{GlobError, PatternError};
|
||||
use globset::{Glob, GlobSetBuilder};
|
||||
use globset::GlobSetBuilder;
|
||||
use itertools::Itertools;
|
||||
use sha2::{Digest, Sha256};
|
||||
use std::fs::FileType;
|
||||
|
@ -19,6 +16,7 @@ use thiserror::Error;
|
|||
use tracing::{debug, trace};
|
||||
use uv_distribution_filename::{SourceDistExtension, SourceDistFilename, WheelFilename};
|
||||
use uv_fs::Simplified;
|
||||
use uv_globfilter::{parse_portable_glob, GlobDirFilter, PortableGlobError};
|
||||
use walkdir::WalkDir;
|
||||
use zip::{CompressionMethod, ZipWriter};
|
||||
|
||||
|
@ -30,16 +28,26 @@ pub enum Error {
|
|||
Toml(#[from] toml::de::Error),
|
||||
#[error("Invalid pyproject.toml")]
|
||||
Validation(#[from] ValidationError),
|
||||
#[error("Invalid `project.license-files` glob expression: `{0}`")]
|
||||
Pep639Glob(String, #[source] Pep639GlobError),
|
||||
#[error("The `project.license-files` entry is not a valid glob pattern: `{0}`")]
|
||||
Pattern(String, #[source] PatternError),
|
||||
/// [`GlobError`] is a wrapped io error.
|
||||
#[error(transparent)]
|
||||
Glob(#[from] GlobError),
|
||||
#[error("Unsupported glob expression in: `{field}`")]
|
||||
PortableGlob {
|
||||
field: String,
|
||||
#[source]
|
||||
source: PortableGlobError,
|
||||
},
|
||||
/// <https://github.com/BurntSushi/ripgrep/discussions/2927>
|
||||
#[error("Glob expressions caused to large regex in: `{field}`")]
|
||||
GlobSetTooLarge {
|
||||
field: String,
|
||||
#[source]
|
||||
source: globset::Error,
|
||||
},
|
||||
/// [`globset::Error`] shows the glob that failed to parse.
|
||||
#[error(transparent)]
|
||||
GlobSet(#[from] globset::Error),
|
||||
#[error("Unsupported glob expression in: `{field}`")]
|
||||
GlobSet {
|
||||
field: String,
|
||||
#[source]
|
||||
err: globset::Error,
|
||||
},
|
||||
#[error("Failed to walk source tree: `{}`", root.user_display())]
|
||||
WalkDir {
|
||||
root: PathBuf,
|
||||
|
@ -322,7 +330,10 @@ pub fn build_wheel(
|
|||
err,
|
||||
})?;
|
||||
|
||||
let relative_path = entry.path().strip_prefix(&strip_root)?;
|
||||
let relative_path = entry
|
||||
.path()
|
||||
.strip_prefix(&strip_root)
|
||||
.expect("walkdir starts with root");
|
||||
let relative_path_str = relative_path
|
||||
.to_str()
|
||||
.ok_or_else(|| Error::NotUtf8Path(relative_path.to_path_buf()))?;
|
||||
|
@ -354,10 +365,52 @@ pub fn build_wheel(
|
|||
Ok(filename)
|
||||
}
|
||||
|
||||
/// TODO(konsti): Wire this up with actual settings and remove this struct.
|
||||
///
|
||||
/// To select which files to include in the source distribution, we first add the includes, then
|
||||
/// remove the excludes from that.
|
||||
pub struct SourceDistSettings {
|
||||
/// Glob expressions which files and directories to include in the source distribution.
|
||||
///
|
||||
/// Includes are anchored, which means that `pyproject.toml` includes only
|
||||
/// `<project root>/pyproject.toml`. Use for example `assets/**/sample.csv` to include for all
|
||||
/// `sample.csv` files in `<project root>/assets` or any child directory. To recursively include
|
||||
/// all files under a directory, use a `/**` suffix, e.g. `src/**`. For performance and
|
||||
/// reproducibility, avoid unanchored matches such as `**/sample.csv`.
|
||||
///
|
||||
/// The glob syntax is the reduced portable glob from
|
||||
/// [PEP 639](https://peps.python.org/pep-0639/#add-license-FILES-key).
|
||||
include: Vec<String>,
|
||||
/// Glob expressions which files and directories to exclude from the previous source
|
||||
/// distribution includes.
|
||||
///
|
||||
/// Excludes are not anchored, which means that `__pycache__` excludes all directories named
|
||||
/// `__pycache__` and it's children anywhere. To anchor a directory, use a `/` prefix, e.g.,
|
||||
/// `/dist` will exclude only `<project root>/dist`.
|
||||
///
|
||||
/// The glob syntax is the reduced portable glob from
|
||||
/// [PEP 639](https://peps.python.org/pep-0639/#add-license-FILES-key).
|
||||
exclude: Vec<String>,
|
||||
}
|
||||
|
||||
impl Default for SourceDistSettings {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
include: vec!["src/**".to_string(), "pyproject.toml".to_string()],
|
||||
exclude: vec![
|
||||
"__pycache__".to_string(),
|
||||
"*.pyc".to_string(),
|
||||
"*.pyo".to_string(),
|
||||
],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a source distribution from the source tree and place it in the output directory.
|
||||
pub fn build_source_dist(
|
||||
source_tree: &Path,
|
||||
source_dist_directory: &Path,
|
||||
settings: SourceDistSettings,
|
||||
uv_version: &str,
|
||||
) -> Result<SourceDistFilename, Error> {
|
||||
let contents = fs_err::read_to_string(source_tree.join("pyproject.toml"))?;
|
||||
|
@ -392,42 +445,75 @@ pub fn build_source_dist(
|
|||
)
|
||||
.map_err(|err| Error::TarWrite(source_dist_path.clone(), err))?;
|
||||
|
||||
let includes = ["src/**/*", "pyproject.toml"];
|
||||
let mut include_builder = GlobSetBuilder::new();
|
||||
for include in includes {
|
||||
include_builder.add(Glob::new(include)?);
|
||||
let mut include_globs = Vec::new();
|
||||
for include in settings.include {
|
||||
let glob = parse_portable_glob(&include).map_err(|err| Error::PortableGlob {
|
||||
field: "tool.uv.source-dist.include".to_string(),
|
||||
source: err,
|
||||
})?;
|
||||
include_globs.push(glob.clone());
|
||||
}
|
||||
let include_matcher = include_builder.build()?;
|
||||
let include_matcher =
|
||||
GlobDirFilter::from_globs(&include_globs).map_err(|err| Error::GlobSetTooLarge {
|
||||
field: "tool.uv.source-dist.include".to_string(),
|
||||
source: err,
|
||||
})?;
|
||||
|
||||
let excludes = ["__pycache__", "*.pyc", "*.pyo"];
|
||||
let mut exclude_builder = GlobSetBuilder::new();
|
||||
for exclude in excludes {
|
||||
exclude_builder.add(Glob::new(exclude)?);
|
||||
for exclude in settings.exclude {
|
||||
// Excludes are unanchored
|
||||
let exclude = if let Some(exclude) = exclude.strip_prefix("/") {
|
||||
exclude.to_string()
|
||||
} else {
|
||||
format!("**/{exclude}").to_string()
|
||||
};
|
||||
let glob = parse_portable_glob(&exclude).map_err(|err| Error::PortableGlob {
|
||||
field: "tool.uv.source-dist.exclude".to_string(),
|
||||
source: err,
|
||||
})?;
|
||||
exclude_builder.add(glob);
|
||||
}
|
||||
let exclude_matcher = exclude_builder.build()?;
|
||||
let exclude_matcher = exclude_builder
|
||||
.build()
|
||||
.map_err(|err| Error::GlobSetTooLarge {
|
||||
field: "tool.uv.source-dist.exclude".to_string(),
|
||||
source: err,
|
||||
})?;
|
||||
|
||||
// TODO(konsti): Add files linked by pyproject.toml
|
||||
|
||||
for file in WalkDir::new(source_tree).into_iter().filter_entry(|dir| {
|
||||
let relative = dir
|
||||
.path()
|
||||
.strip_prefix(source_tree)
|
||||
.expect("walkdir starts with root");
|
||||
// TODO(konsti): Also check that we're matching at least a prefix of an include matcher.
|
||||
!exclude_matcher.is_match(relative)
|
||||
}) {
|
||||
let entry = file.map_err(|err| Error::WalkDir {
|
||||
root: source_tree.to_path_buf(),
|
||||
err,
|
||||
})?;
|
||||
for entry in WalkDir::new(source_tree).into_iter().filter_entry(|entry| {
|
||||
// TODO(konsti): This should be prettier.
|
||||
let relative = entry
|
||||
.path()
|
||||
.strip_prefix(source_tree)
|
||||
.expect("walkdir starts with root");
|
||||
if !include_matcher.is_match(relative) {
|
||||
.expect("walkdir starts with root")
|
||||
.to_path_buf();
|
||||
|
||||
// Fast path: Don't descend into a directory that can't be included. This is the most
|
||||
// important performance optimization, it avoids descending into directories such as
|
||||
// `.venv`. While walkdir is generally cheap, we still avoid traversing large data
|
||||
// directories that often exist on the top level of a project. This is especially noticeable
|
||||
// on network file systems with high latencies per operation (while contiguous reading may
|
||||
// still be fast).
|
||||
include_matcher.match_directory(&relative) && !exclude_matcher.is_match(&relative)
|
||||
}) {
|
||||
let entry = entry.map_err(|err| Error::WalkDir {
|
||||
root: source_tree.to_path_buf(),
|
||||
err,
|
||||
})?;
|
||||
// TODO(konsti): This should be prettier.
|
||||
let relative = entry
|
||||
.path()
|
||||
.strip_prefix(source_tree)
|
||||
.expect("walkdir starts with root")
|
||||
.to_path_buf();
|
||||
|
||||
if !include_matcher.match_path(&relative) || exclude_matcher.is_match(&relative) {
|
||||
trace!("Excluding {}", relative.user_display());
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
debug!("Including {}", relative.user_display());
|
||||
|
||||
let metadata = fs_err::metadata(entry.path())?;
|
||||
|
@ -462,7 +548,7 @@ pub fn build_source_dist(
|
|||
.map_err(|err| Error::TarWrite(source_dist_path.clone(), err))?;
|
||||
} else {
|
||||
return Err(Error::UnsupportedFileType(
|
||||
relative.to_path_buf(),
|
||||
relative.clone(),
|
||||
entry.file_type(),
|
||||
));
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue