From ff91cf7730df7f93d2de9cf16f9c94a04b107f5c Mon Sep 17 00:00:00 2001 From: konsti Date: Mon, 2 Dec 2024 16:57:04 +0100 Subject: [PATCH] Build backend: Refactoring before list (#9558) For listing files, we first use a directory writer for source dists, which we will use for collecting the filenames instead of writing the archive in the future. I've split breaking `lib.rs` of uv-build-backend into modules into the next PR. No logic changes, only restructuring. Best reviewed commit-by-commit --- crates/uv-build-backend/src/lib.rs | 340 ++++++++++++++--------- crates/uv/src/commands/build_frontend.rs | 4 +- 2 files changed, 206 insertions(+), 138 deletions(-) diff --git a/crates/uv-build-backend/src/lib.rs b/crates/uv-build-backend/src/lib.rs index 911c1d002..80de110bf 100644 --- a/crates/uv-build-backend/src/lib.rs +++ b/crates/uv-build-backend/src/lib.rs @@ -20,7 +20,7 @@ use uv_distribution_filename::{SourceDistExtension, SourceDistFilename, WheelFil use uv_fs::Simplified; use uv_globfilter::{parse_portable_glob, GlobDirFilter, PortableGlobError}; use uv_warnings::warn_user_once; -use walkdir::{DirEntry, WalkDir}; +use walkdir::WalkDir; use zip::{CompressionMethod, ZipWriter}; #[derive(Debug, Error)] @@ -77,9 +77,10 @@ pub enum Error { TarWrite(PathBuf, #[source] io::Error), } -/// Allow dispatching between writing to a directory, writing to zip and writing to a `.tar.gz`. +/// Dispatcher between writing to a directory, writing to a zip, writing to a `.tar.gz` and +/// listing files. /// -/// All paths are string types instead of path types since wheel are portable between platforms. +/// All paths are string types instead of path types since wheels are portable between platforms. /// /// Contract: You must call close before dropping to obtain a valid output (dropping is fine in the /// error case). @@ -87,9 +88,6 @@ trait DirectoryWriter { /// Add a file with the given content. fn write_bytes(&mut self, path: &str, bytes: &[u8]) -> Result<(), Error>; - /// Add a file with the given name and return a writer for it. - fn new_writer<'slf>(&'slf mut self, path: &str) -> Result, Error>; - /// Add a local file. fn write_file(&mut self, path: &str, file: &Path) -> Result<(), Error>; @@ -129,6 +127,16 @@ impl ZipDirectoryWriter { record: Vec::new(), } } + + /// Add a file with the given name and return a writer for it. + fn new_writer<'slf>(&'slf mut self, path: &str) -> Result, Error> { + // TODO(konsti): We need to preserve permissions, at least the executable bit. + self.writer.start_file( + path, + zip::write::FileOptions::default().compression_method(self.compression), + )?; + Ok(Box::new(&mut self.writer)) + } } impl DirectoryWriter for ZipDirectoryWriter { @@ -148,15 +156,6 @@ impl DirectoryWriter for ZipDirectoryWriter { Ok(()) } - fn new_writer<'slf>(&'slf mut self, path: &str) -> Result, Error> { - // TODO(konsti): We need to preserve permissions, at least the executable bit. - self.writer.start_file( - path, - zip::write::FileOptions::default().compression_method(self.compression), - )?; - Ok(Box::new(&mut self.writer)) - } - fn write_file(&mut self, path: &str, file: &Path) -> Result<(), Error> { trace!("Adding {} from {}", path, file.user_display()); let mut reader = BufReader::new(File::open(file)?); @@ -186,24 +185,30 @@ impl DirectoryWriter for ZipDirectoryWriter { } } -struct FilesystemWrite { +struct FilesystemWriter { /// The virtualenv or metadata directory that add file paths are relative to. root: PathBuf, /// The entries in the `RECORD` file. record: Vec, } -impl FilesystemWrite { +impl FilesystemWriter { fn new(root: &Path) -> Self { Self { root: root.to_owned(), record: Vec::new(), } } + + /// Add a file with the given name and return a writer for it. + fn new_writer<'slf>(&'slf mut self, path: &str) -> Result, Error> { + trace!("Adding {}", path); + Ok(Box::new(File::create(self.root.join(path))?)) + } } /// File system writer. -impl DirectoryWriter for FilesystemWrite { +impl DirectoryWriter for FilesystemWriter { fn write_bytes(&mut self, path: &str, bytes: &[u8]) -> Result<(), Error> { trace!("Adding {}", path); let hash = format!("{:x}", Sha256::new().chain_update(bytes).finalize()); @@ -215,12 +220,6 @@ impl DirectoryWriter for FilesystemWrite { Ok(fs_err::write(self.root.join(path), bytes)?) } - - fn new_writer<'slf>(&'slf mut self, path: &str) -> Result, Error> { - trace!("Adding {}", path); - Ok(Box::new(File::create(self.root.join(path))?)) - } - fn write_file(&mut self, path: &str, file: &Path) -> Result<(), Error> { trace!("Adding {} from {}", path, file.user_display()); let mut reader = BufReader::new(File::open(file)?); @@ -249,6 +248,82 @@ impl DirectoryWriter for FilesystemWrite { } } +struct TarGzWriter { + path: PathBuf, + tar: tar::Builder>, +} + +impl TarGzWriter { + fn new(path: impl Into) -> Result { + let path = path.into(); + let file = File::create(&path)?; + let enc = GzEncoder::new(file, Compression::default()); + let tar = tar::Builder::new(enc); + Ok(Self { path, tar }) + } +} + +impl DirectoryWriter for TarGzWriter { + fn write_bytes(&mut self, path: &str, bytes: &[u8]) -> Result<(), Error> { + let mut header = Header::new_gnu(); + header.set_size(bytes.len() as u64); + // Reasonable default to avoid 0o000 permissions, the user's umask will be applied on + // unpacking. + header.set_mode(0o644); + header.set_cksum(); + self.tar + .append_data(&mut header, path, Cursor::new(bytes)) + .map_err(|err| Error::TarWrite(self.path.clone(), err))?; + Ok(()) + } + + fn write_file(&mut self, path: &str, file: &Path) -> Result<(), Error> { + let metadata = fs_err::metadata(file)?; + let mut header = Header::new_gnu(); + #[cfg(unix)] + { + // Preserve for example an executable bit. + header.set_mode(std::os::unix::fs::MetadataExt::mode(&metadata)); + } + #[cfg(not(unix))] + { + // Reasonable default to avoid 0o000 permissions, the user's umask will be applied on + // unpacking. + header.set_mode(0o644); + } + header.set_size(metadata.len()); + header.set_cksum(); + let reader = BufReader::new(File::open(file)?); + self.tar + .append_data(&mut header, path, reader) + .map_err(|err| Error::TarWrite(self.path.clone(), err))?; + Ok(()) + } + + fn write_directory(&mut self, directory: &str) -> Result<(), Error> { + let mut header = Header::new_gnu(); + // Directories are always executable, which means they can be listed. + header.set_mode(0o755); + header.set_entry_type(EntryType::Directory); + header + .set_path(directory) + .map_err(|err| Error::TarWrite(self.path.clone(), err))?; + header.set_size(0); + header.set_cksum(); + self.tar + .append(&header, io::empty()) + .map_err(|err| Error::TarWrite(self.path.clone(), err))?; + Ok(()) + } + + fn close(mut self, _dist_info_dir: &str) -> Result<(), Error> { + self.tar + .finish() + .map_err(|err| Error::TarWrite(self.path.clone(), err))?; + Ok(()) + } +} + /// An entry in the `RECORD` file. /// /// @@ -604,6 +679,25 @@ pub fn build_source_dist( source_tree: &Path, source_dist_directory: &Path, uv_version: &str, +) -> Result { + let contents = fs_err::read_to_string(source_tree.join("pyproject.toml"))?; + let pyproject_toml = PyProjectToml::parse(&contents)?; + let filename = SourceDistFilename { + name: pyproject_toml.name().clone(), + version: pyproject_toml.version().clone(), + extension: SourceDistExtension::TarGz, + }; + let source_dist_path = source_dist_directory.join(filename.to_string()); + let writer = TarGzWriter::new(&source_dist_path)?; + write_source_dist(source_tree, writer, uv_version)?; + Ok(filename) +} + +/// Shared implementation for building and listing a source distribution. +fn write_source_dist( + source_tree: &Path, + mut writer: impl DirectoryWriter, + uv_version: &str, ) -> Result { let contents = fs_err::read_to_string(source_tree.join("pyproject.toml"))?; let pyproject_toml = PyProjectToml::parse(&contents)?; @@ -627,25 +721,93 @@ pub fn build_source_dist( pyproject_toml.version() ); - let source_dist_path = source_dist_directory.join(filename.to_string()); - let tar_gz = File::create(&source_dist_path)?; - let enc = GzEncoder::new(tar_gz, Compression::default()); - let mut tar = tar::Builder::new(enc); - let metadata = pyproject_toml.to_metadata(source_tree)?; let metadata_email = metadata.core_metadata_format(); - let mut header = Header::new_gnu(); - header.set_size(metadata_email.bytes().len() as u64); - header.set_mode(0o644); - header.set_cksum(); - tar.append_data( - &mut header, - Path::new(&top_level).join("PKG-INFO"), - Cursor::new(metadata_email), - ) - .map_err(|err| Error::TarWrite(source_dist_path.clone(), err))?; + writer.write_bytes( + &Path::new(&top_level) + .join("PKG-INFO") + .portable_display() + .to_string(), + metadata_email.as_bytes(), + )?; + let (include_matcher, exclude_matcher) = source_dist_matcher(&pyproject_toml, settings)?; + + let mut files_visited = 0; + for entry in WalkDir::new(source_tree).into_iter().filter_entry(|entry| { + // TODO(konsti): This should be prettier. + let relative = entry + .path() + .strip_prefix(source_tree) + .expect("walkdir starts with root"); + + // Fast path: Don't descend into a directory that can't be included. This is the most + // important performance optimization, it avoids descending into directories such as + // `.venv`. While walkdir is generally cheap, we still avoid traversing large data + // directories that often exist on the top level of a project. This is especially noticeable + // on network file systems with high latencies per operation (while contiguous reading may + // still be fast). + include_matcher.match_directory(relative) && !exclude_matcher.is_match(relative) + }) { + let entry = entry.map_err(|err| Error::WalkDir { + root: source_tree.to_path_buf(), + err, + })?; + + files_visited += 1; + if files_visited > 10000 { + warn_user_once!( + "Visited more than 10,000 files for source distribution build. \ + Consider using more constrained includes or more excludes." + ); + } + // TODO(konsti): This should be prettier. + let relative = entry + .path() + .strip_prefix(source_tree) + .expect("walkdir starts with root"); + + if !include_matcher.match_path(relative) || exclude_matcher.is_match(relative) { + trace!("Excluding: `{}`", relative.user_display()); + continue; + }; + + debug!("Including {}", relative.user_display()); + if entry.file_type().is_dir() { + writer.write_directory( + &Path::new(&top_level) + .join(relative) + .portable_display() + .to_string(), + )?; + } else if entry.file_type().is_file() { + writer.write_file( + &Path::new(&top_level) + .join(relative) + .portable_display() + .to_string(), + entry.path(), + )?; + } else { + return Err(Error::UnsupportedFileType( + relative.to_path_buf(), + entry.file_type(), + )); + } + } + debug!("Visited {files_visited} files for source dist build"); + + writer.close(&top_level)?; + + Ok(filename) +} + +/// Build includes and excludes for source tree walking for source dists. +fn source_dist_matcher( + pyproject_toml: &PyProjectToml, + settings: BuildBackendSettings, +) -> Result<(GlobDirFilter, GlobSet), Error> { // File and directories to include in the source directory let mut include_globs = Vec::new(); let mut includes: Vec = settings.source_include; @@ -724,54 +886,7 @@ pub fn build_source_dist( if exclude_matcher.is_match("pyproject.toml") { return Err(Error::PyprojectTomlExcluded); } - - let mut files_visited = 0; - for entry in WalkDir::new(source_tree).into_iter().filter_entry(|entry| { - // TODO(konsti): This should be prettier. - let relative = entry - .path() - .strip_prefix(source_tree) - .expect("walkdir starts with root"); - - // Fast path: Don't descend into a directory that can't be included. This is the most - // important performance optimization, it avoids descending into directories such as - // `.venv`. While walkdir is generally cheap, we still avoid traversing large data - // directories that often exist on the top level of a project. This is especially noticeable - // on network file systems with high latencies per operation (while contiguous reading may - // still be fast). - include_matcher.match_directory(relative) && !exclude_matcher.is_match(relative) - }) { - let entry = entry.map_err(|err| Error::WalkDir { - root: source_tree.to_path_buf(), - err, - })?; - - files_visited += 1; - if files_visited > 10000 { - warn_user_once!( - "Visited more than 10,000 files for source distribution build. \ - Consider using more constrained includes or more excludes." - ); - } - // TODO(konsti): This should be prettier. - let relative = entry - .path() - .strip_prefix(source_tree) - .expect("walkdir starts with root"); - - if !include_matcher.match_path(relative) || exclude_matcher.is_match(relative) { - trace!("Excluding: `{}`", relative.user_display()); - continue; - }; - - add_source_dist_entry(&mut tar, &entry, &top_level, &source_dist_path, relative)?; - } - debug!("Visited {files_visited} files for source dist build"); - - tar.finish() - .map_err(|err| Error::TarWrite(source_dist_path.clone(), err))?; - - Ok(filename) + Ok((include_matcher, exclude_matcher)) } /// Build a globset matcher for excludes. @@ -802,55 +917,6 @@ fn build_exclude_matcher( Ok(exclude_matcher) } -/// Add a file or a directory to a source distribution. -fn add_source_dist_entry( - tar: &mut tar::Builder>, - entry: &DirEntry, - top_level: &str, - source_dist_path: &Path, - relative: &Path, -) -> Result<(), Error> { - debug!("Including {}", relative.user_display()); - - let metadata = fs_err::metadata(entry.path())?; - let mut header = Header::new_gnu(); - #[cfg(unix)] - { - header.set_mode(std::os::unix::fs::MetadataExt::mode(&metadata)); - } - #[cfg(not(unix))] - { - header.set_mode(0o644); - } - - if entry.file_type().is_dir() { - header.set_entry_type(EntryType::Directory); - header - .set_path(Path::new(&top_level).join(relative)) - .map_err(|err| Error::TarWrite(source_dist_path.to_path_buf(), err))?; - header.set_size(0); - header.set_cksum(); - tar.append(&header, io::empty()) - .map_err(|err| Error::TarWrite(source_dist_path.to_path_buf(), err))?; - Ok(()) - } else if entry.file_type().is_file() { - header.set_size(metadata.len()); - header.set_cksum(); - tar.append_data( - &mut header, - Path::new(&top_level).join(relative), - BufReader::new(File::open(entry.path())?), - ) - .map_err(|err| Error::TarWrite(source_dist_path.to_path_buf(), err))?; - Ok(()) - } else { - Err(Error::UnsupportedFileType( - relative.to_path_buf(), - entry.file_type(), - )) - } -} - /// Write the dist-info directory to the output directory without building the wheel. pub fn metadata( source_tree: &Path, @@ -876,7 +942,7 @@ pub fn metadata( "Writing metadata files to {}", metadata_directory.user_display() ); - let mut wheel_writer = FilesystemWrite::new(metadata_directory); + let mut wheel_writer = FilesystemWriter::new(metadata_directory); let dist_info_dir = write_dist_info( &mut wheel_writer, &pyproject_toml, diff --git a/crates/uv/src/commands/build_frontend.rs b/crates/uv/src/commands/build_frontend.rs index 9151e1fb7..8b5724ec7 100644 --- a/crates/uv/src/commands/build_frontend.rs +++ b/crates/uv/src/commands/build_frontend.rs @@ -7,7 +7,7 @@ use std::path::{Path, PathBuf}; use anyhow::Result; use owo_colors::OwoColorize; -use tracing::{debug, trace}; +use tracing::{debug, instrument, trace}; use uv_distribution_filename::SourceDistExtension; use uv_distribution_types::{DependencyMetadata, Index, IndexLocations, SourceDist}; use uv_install_wheel::linker::LinkMode; @@ -724,6 +724,7 @@ async fn build_package( } /// Build a source distribution, either through PEP 517 or through the fast path. +#[instrument(skip_all)] async fn build_sdist( source_tree: &Path, output_dir: &Path, @@ -781,6 +782,7 @@ async fn build_sdist( } /// Build a wheel, either through PEP 517 or through the fast path. +#[instrument(skip_all)] async fn build_wheel( source_tree: &Path, output_dir: &Path,