uv/crates/uv-build-backend/src/source_dist.rs
konsti 18a1f0d9db
Build backend: Normalize glob paths (#13465)
Unlike OS APIs, glob inclusion checks don't work when there are relative
path elements such as `./`. We normalize the path before using it for
the glob.

Fixes #13407
2025-05-15 17:19:02 +02:00

361 lines
13 KiB
Rust

use crate::metadata::DEFAULT_EXCLUDES;
use crate::wheel::build_exclude_matcher;
use crate::{
find_roots, BuildBackendSettings, DirectoryWriter, Error, FileList, ListWriter, PyProjectToml,
};
use flate2::write::GzEncoder;
use flate2::Compression;
use fs_err::File;
use globset::{Glob, GlobSet};
use std::io;
use std::io::{BufReader, Cursor};
use std::path::{Path, PathBuf};
use std::str::FromStr;
use tar::{EntryType, Header};
use tracing::{debug, trace};
use uv_distribution_filename::{SourceDistExtension, SourceDistFilename};
use uv_fs::Simplified;
use uv_globfilter::{GlobDirFilter, PortableGlobParser};
use uv_pypi_types::Identifier;
use uv_warnings::warn_user_once;
use walkdir::WalkDir;
/// Build a source distribution from the source tree and place it in the output directory.
pub fn build_source_dist(
source_tree: &Path,
source_dist_directory: &Path,
uv_version: &str,
) -> Result<SourceDistFilename, Error> {
let contents = fs_err::read_to_string(source_tree.join("pyproject.toml"))?;
let pyproject_toml = PyProjectToml::parse(&contents)?;
let filename = SourceDistFilename {
name: pyproject_toml.name().clone(),
version: pyproject_toml.version().clone(),
extension: SourceDistExtension::TarGz,
};
let source_dist_path = source_dist_directory.join(filename.to_string());
let writer = TarGzWriter::new(&source_dist_path)?;
write_source_dist(source_tree, writer, uv_version)?;
Ok(filename)
}
/// List the files that would be included in a source distribution and their origin.
pub fn list_source_dist(
source_tree: &Path,
uv_version: &str,
) -> Result<(SourceDistFilename, FileList), Error> {
let contents = fs_err::read_to_string(source_tree.join("pyproject.toml"))?;
let pyproject_toml = PyProjectToml::parse(&contents)?;
let filename = SourceDistFilename {
name: pyproject_toml.name().clone(),
version: pyproject_toml.version().clone(),
extension: SourceDistExtension::TarGz,
};
let mut files = FileList::new();
let writer = ListWriter::new(&mut files);
write_source_dist(source_tree, writer, uv_version)?;
Ok((filename, files))
}
/// Build includes and excludes for source tree walking for source dists.
fn source_dist_matcher(
pyproject_toml: &PyProjectToml,
settings: BuildBackendSettings,
) -> Result<(GlobDirFilter, GlobSet), Error> {
// File and directories to include in the source directory
let mut include_globs = Vec::new();
let mut includes: Vec<String> = settings.source_include;
// pyproject.toml is always included.
includes.push(globset::escape("pyproject.toml"));
let module_name = if let Some(module_name) = settings.module_name {
module_name
} else {
// Should never error, the rules for package names (in dist-info formatting) are stricter
// than those for identifiers
Identifier::from_str(pyproject_toml.name().as_dist_info_name().as_ref())?
};
debug!("Module name: `{:?}`", module_name);
// The wheel must not include any files included by the source distribution (at least until we
// have files generated in the source dist -> wheel build step).
let import_path = uv_fs::normalize_path(&settings.module_root.join(module_name.as_ref()))
.portable_display()
.to_string();
includes.push(format!("{}/**", globset::escape(&import_path)));
for include in includes {
let glob = PortableGlobParser::Uv
.parse(&include)
.map_err(|err| Error::PortableGlob {
field: "tool.uv.build-backend.source-include".to_string(),
source: err,
})?;
include_globs.push(glob);
}
// Include the Readme
if let Some(readme) = pyproject_toml
.readme()
.as_ref()
.and_then(|readme| readme.path())
{
let readme = uv_fs::normalize_path(readme);
trace!("Including readme at: `{}`", readme.user_display());
let readme = readme.portable_display().to_string();
let glob = Glob::new(&globset::escape(&readme)).expect("escaped globset is parseable");
include_globs.push(glob);
}
// Include the license files
for license_files in pyproject_toml.license_files_source_dist() {
trace!("Including license files at: `{license_files}`");
let glob = PortableGlobParser::Pep639
.parse(license_files)
.map_err(|err| Error::PortableGlob {
field: "project.license-files".to_string(),
source: err,
})?;
include_globs.push(glob);
}
// Include the data files
for (name, directory) in settings.data.iter() {
let directory = uv_fs::normalize_path(Path::new(directory));
trace!(
"Including data ({}) at: `{}`",
name,
directory.user_display()
);
let directory = directory.portable_display().to_string();
let glob = PortableGlobParser::Uv
.parse(&format!("{}/**", globset::escape(&directory)))
.map_err(|err| Error::PortableGlob {
field: format!("tool.uv.build-backend.data.{name}"),
source: err,
})?;
include_globs.push(glob);
}
let include_matcher =
GlobDirFilter::from_globs(&include_globs).map_err(|err| Error::GlobSetTooLarge {
field: "tool.uv.build-backend.source-include".to_string(),
source: err,
})?;
let mut excludes: Vec<String> = Vec::new();
if settings.default_excludes {
excludes.extend(DEFAULT_EXCLUDES.iter().map(ToString::to_string));
}
for exclude in settings.source_exclude {
// Avoid duplicate entries.
if !excludes.contains(&exclude) {
excludes.push(exclude);
}
}
debug!("Source dist excludes: {:?}", excludes);
let exclude_matcher = build_exclude_matcher(excludes)?;
if exclude_matcher.is_match("pyproject.toml") {
return Err(Error::PyprojectTomlExcluded);
}
Ok((include_matcher, exclude_matcher))
}
/// Shared implementation for building and listing a source distribution.
fn write_source_dist(
source_tree: &Path,
mut writer: impl DirectoryWriter,
uv_version: &str,
) -> Result<SourceDistFilename, Error> {
let contents = fs_err::read_to_string(source_tree.join("pyproject.toml"))?;
let pyproject_toml = PyProjectToml::parse(&contents)?;
for warning in pyproject_toml.check_build_system(uv_version) {
warn_user_once!("{warning}");
}
let settings = pyproject_toml
.settings()
.cloned()
.unwrap_or_else(BuildBackendSettings::default);
let filename = SourceDistFilename {
name: pyproject_toml.name().clone(),
version: pyproject_toml.version().clone(),
extension: SourceDistExtension::TarGz,
};
let top_level = format!(
"{}-{}",
pyproject_toml.name().as_dist_info_name(),
pyproject_toml.version()
);
let metadata = pyproject_toml.to_metadata(source_tree)?;
let metadata_email = metadata.core_metadata_format();
debug!("Adding content files to wheel");
// Check that the source tree contains a module.
find_roots(
source_tree,
&pyproject_toml,
&settings.module_root,
settings.module_name.as_ref(),
)?;
writer.write_bytes(
&Path::new(&top_level)
.join("PKG-INFO")
.portable_display()
.to_string(),
metadata_email.as_bytes(),
)?;
let (include_matcher, exclude_matcher) = source_dist_matcher(&pyproject_toml, settings)?;
let mut files_visited = 0;
for entry in WalkDir::new(source_tree)
.sort_by_file_name()
.into_iter()
.filter_entry(|entry| {
// TODO(konsti): This should be prettier.
let relative = entry
.path()
.strip_prefix(source_tree)
.expect("walkdir starts with root");
// Fast path: Don't descend into a directory that can't be included. This is the most
// important performance optimization, it avoids descending into directories such as
// `.venv`. While walkdir is generally cheap, we still avoid traversing large data
// directories that often exist on the top level of a project. This is especially noticeable
// on network file systems with high latencies per operation (while contiguous reading may
// still be fast).
include_matcher.match_directory(relative) && !exclude_matcher.is_match(relative)
})
{
let entry = entry.map_err(|err| Error::WalkDir {
root: source_tree.to_path_buf(),
err,
})?;
files_visited += 1;
if files_visited > 10000 {
warn_user_once!(
"Visited more than 10,000 files for source distribution build. \
Consider using more constrained includes or more excludes."
);
}
// TODO(konsti): This should be prettier.
let relative = entry
.path()
.strip_prefix(source_tree)
.expect("walkdir starts with root");
if !include_matcher.match_path(relative) || exclude_matcher.is_match(relative) {
trace!("Excluding: `{}`", relative.user_display());
continue;
}
debug!("Including {}", relative.user_display());
if entry.file_type().is_dir() {
writer.write_directory(
&Path::new(&top_level)
.join(relative)
.portable_display()
.to_string(),
)?;
} else if entry.file_type().is_file() {
writer.write_file(
&Path::new(&top_level)
.join(relative)
.portable_display()
.to_string(),
entry.path(),
)?;
} else {
return Err(Error::UnsupportedFileType(
relative.to_path_buf(),
entry.file_type(),
));
}
}
debug!("Visited {files_visited} files for source dist build");
writer.close(&top_level)?;
Ok(filename)
}
struct TarGzWriter {
path: PathBuf,
tar: tar::Builder<GzEncoder<File>>,
}
impl TarGzWriter {
fn new(path: impl Into<PathBuf>) -> Result<Self, Error> {
let path = path.into();
let file = File::create(&path)?;
let enc = GzEncoder::new(file, Compression::default());
let tar = tar::Builder::new(enc);
Ok(Self { path, tar })
}
}
impl DirectoryWriter for TarGzWriter {
fn write_bytes(&mut self, path: &str, bytes: &[u8]) -> Result<(), Error> {
let mut header = Header::new_gnu();
header.set_size(bytes.len() as u64);
// Reasonable default to avoid 0o000 permissions, the user's umask will be applied on
// unpacking.
header.set_mode(0o644);
self.tar
.append_data(&mut header, path, Cursor::new(bytes))
.map_err(|err| Error::TarWrite(self.path.clone(), err))?;
Ok(())
}
fn write_file(&mut self, path: &str, file: &Path) -> Result<(), Error> {
let metadata = fs_err::metadata(file)?;
let mut header = Header::new_gnu();
// Preserve the executable bit, especially for scripts
#[cfg(unix)]
let executable_bit = {
use std::os::unix::fs::PermissionsExt;
file.metadata()?.permissions().mode() & 0o111 != 0
};
// Windows has no executable bit
#[cfg(not(unix))]
let executable_bit = false;
// Set reasonable defaults to avoid 0o000 permissions, while avoiding adding the exact
// filesystem permissions to the archive for reproducibility. Where applicable, the
// operating system filters the stored permission by the user's umask when unpacking.
if executable_bit {
header.set_mode(0o755);
} else {
header.set_mode(0o644);
}
header.set_size(metadata.len());
let reader = BufReader::new(File::open(file)?);
self.tar
.append_data(&mut header, path, reader)
.map_err(|err| Error::TarWrite(self.path.clone(), err))?;
Ok(())
}
fn write_directory(&mut self, directory: &str) -> Result<(), Error> {
let mut header = Header::new_gnu();
// Directories are always executable, which means they can be listed.
header.set_mode(0o755);
header.set_entry_type(EntryType::Directory);
header.set_size(0);
self.tar
.append_data(&mut header, directory, io::empty())
.map_err(|err| Error::TarWrite(self.path.clone(), err))?;
Ok(())
}
fn close(mut self, _dist_info_dir: &str) -> Result<(), Error> {
self.tar
.finish()
.map_err(|err| Error::TarWrite(self.path.clone(), err))?;
Ok(())
}
}