From a267a501b6aae5e0a7fcff87f75870cc321dae76 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Sun, 10 Mar 2024 08:39:28 -0700 Subject: [PATCH] Add `Seek` fallback for zip files (#2320) ## Summary Some zip files can't be streamed; in particular, `rs-async-zip` doesn't support data descriptors right now (though it may in the future). This PR adds a fallback path for such zips that downloads the entire zip file to disk, then unzips it from disk (which gives us `Seek`). Closes https://github.com/astral-sh/uv/issues/2216. ## Test Plan `cargo run pip install --extra-index-url https://buf.build/gen/python hashb_foxglove_protocolbuffers_python==25.3.0.1.20240226043130+465630478360 --force-reinstall -n` --- crates/install-wheel-rs/src/lib.rs | 134 +----------- crates/install-wheel-rs/src/metadata.rs | 197 ++++++++++++++++++ crates/uv-client/src/error.rs | 2 +- crates/uv-client/src/registry_client.rs | 4 +- crates/uv-client/src/remote_metadata.rs | 4 +- .../src/distribution_database.rs | 168 +++++++++++++-- crates/uv-distribution/src/download.rs | 34 +++ crates/uv-distribution/src/error.rs | 2 +- crates/uv-distribution/src/source/mod.rs | 6 +- crates/uv-extract/src/error.rs | 16 +- crates/uv-extract/src/lib.rs | 1 + crates/uv-extract/src/seek.rs | 115 ++++++++++ crates/uv/tests/pip_compile.rs | 38 ++++ crates/uv/tests/pip_sync.rs | 30 +++ 14 files changed, 591 insertions(+), 160 deletions(-) create mode 100644 crates/install-wheel-rs/src/metadata.rs create mode 100644 crates/uv-extract/src/seek.rs diff --git a/crates/install-wheel-rs/src/lib.rs b/crates/install-wheel-rs/src/lib.rs index 6f5f41c92..76fe1b9c3 100644 --- a/crates/install-wheel-rs/src/lib.rs +++ b/crates/install-wheel-rs/src/lib.rs @@ -1,16 +1,13 @@ //! Takes a wheel and installs it into a venv. use std::io; -use std::io::{Read, Seek}; + use std::path::PathBuf; -use std::str::FromStr; use platform_info::PlatformInfoError; use thiserror::Error; use zip::result::ZipError; -use zip::ZipArchive; -use distribution_filename::WheelFilename; use pep440_rs::Version; use platform_host::{Arch, Os}; use pypi_types::Scheme; @@ -19,6 +16,7 @@ use uv_fs::Simplified; use uv_normalize::PackageName; pub mod linker; +pub mod metadata; mod record; mod script; mod uninstall; @@ -99,131 +97,3 @@ pub enum Error { #[error("Wheel version does not match filename: {0} != {1}")] MismatchedVersion(Version, Version), } - -/// Returns `true` if the file is a `METADATA` file in a `dist-info` directory that matches the -/// wheel filename. -pub fn is_metadata_entry(path: &str, filename: &WheelFilename) -> bool { - let Some((dist_info_dir, file)) = path.split_once('/') else { - return false; - }; - if file != "METADATA" { - return false; - } - let Some(dir_stem) = dist_info_dir.strip_suffix(".dist-info") else { - return false; - }; - let Some((name, version)) = dir_stem.rsplit_once('-') else { - return false; - }; - let Ok(name) = PackageName::from_str(name) else { - return false; - }; - if name != filename.name { - return false; - } - let Ok(version) = Version::from_str(version) else { - return false; - }; - if version != filename.version { - return false; - } - true -} - -/// Find the `dist-info` directory from a list of files. -/// -/// The metadata name may be uppercase, while the wheel and dist info names are lowercase, or -/// the metadata name and the dist info name are lowercase, while the wheel name is uppercase. -/// Either way, we just search the wheel for the name. -/// -/// Returns the dist info dir prefix without the `.dist-info` extension. 
-/// -/// Reference implementation: -pub fn find_dist_info<'a, T: Copy>( - filename: &WheelFilename, - files: impl Iterator, -) -> Result<(T, &'a str), Error> { - let metadatas: Vec<_> = files - .filter_map(|(payload, path)| { - let (dist_info_dir, file) = path.split_once('/')?; - if file != "METADATA" { - return None; - } - - let dir_stem = dist_info_dir.strip_suffix(".dist-info")?; - let (name, version) = dir_stem.rsplit_once('-')?; - if PackageName::from_str(name).ok()? != filename.name { - return None; - } - - if Version::from_str(version).ok()? != filename.version { - return None; - } - - Some((payload, dir_stem)) - }) - .collect(); - let (payload, dist_info_prefix) = match metadatas[..] { - [] => { - return Err(Error::MissingDistInfo); - } - [(payload, path)] => (payload, path), - _ => { - return Err(Error::MultipleDistInfo( - metadatas - .into_iter() - .map(|(_, dist_info_dir)| dist_info_dir.to_string()) - .collect::>() - .join(", "), - )); - } - }; - Ok((payload, dist_info_prefix)) -} - -/// Given an archive, read the `dist-info` metadata into a buffer. -pub fn read_dist_info( - filename: &WheelFilename, - archive: &mut ZipArchive, -) -> Result, Error> { - let dist_info_prefix = - find_dist_info(filename, archive.file_names().map(|name| (name, name)))?.1; - - let mut file = archive - .by_name(&format!("{dist_info_prefix}.dist-info/METADATA")) - .map_err(|err| Error::Zip(filename.to_string(), err))?; - - #[allow(clippy::cast_possible_truncation)] - let mut buffer = Vec::with_capacity(file.size() as usize); - file.read_to_end(&mut buffer)?; - - Ok(buffer) -} - -#[cfg(test)] -mod test { - use std::str::FromStr; - - use distribution_filename::WheelFilename; - - use crate::find_dist_info; - - #[test] - fn test_dot_in_name() { - let files = [ - "mastodon/Mastodon.py", - "mastodon/__init__.py", - "mastodon/streaming.py", - "Mastodon.py-1.5.1.dist-info/DESCRIPTION.rst", - "Mastodon.py-1.5.1.dist-info/metadata.json", - "Mastodon.py-1.5.1.dist-info/top_level.txt", - "Mastodon.py-1.5.1.dist-info/WHEEL", - "Mastodon.py-1.5.1.dist-info/METADATA", - "Mastodon.py-1.5.1.dist-info/RECORD", - ]; - let filename = WheelFilename::from_str("Mastodon.py-1.5.1-py2.py3-none-any.whl").unwrap(); - let (_, dist_info_prefix) = - find_dist_info(&filename, files.into_iter().map(|file| (file, file))).unwrap(); - assert_eq!(dist_info_prefix, "Mastodon.py-1.5.1"); - } -} diff --git a/crates/install-wheel-rs/src/metadata.rs b/crates/install-wheel-rs/src/metadata.rs new file mode 100644 index 000000000..0b3bd1a3d --- /dev/null +++ b/crates/install-wheel-rs/src/metadata.rs @@ -0,0 +1,197 @@ +use std::io::{Read, Seek}; +use std::path::Path; +use std::str::FromStr; + +use zip::ZipArchive; + +use distribution_filename::WheelFilename; +use pep440_rs::Version; +use uv_normalize::PackageName; + +use crate::Error; + +/// Returns `true` if the file is a `METADATA` file in a `.dist-info` directory that matches the +/// wheel filename. 
+pub fn is_metadata_entry(path: &str, filename: &WheelFilename) -> bool { + let Some((dist_info_dir, file)) = path.split_once('/') else { + return false; + }; + if file != "METADATA" { + return false; + } + let Some(dir_stem) = dist_info_dir.strip_suffix(".dist-info") else { + return false; + }; + let Some((name, version)) = dir_stem.rsplit_once('-') else { + return false; + }; + let Ok(name) = PackageName::from_str(name) else { + return false; + }; + if name != filename.name { + return false; + } + let Ok(version) = Version::from_str(version) else { + return false; + }; + if version != filename.version { + return false; + } + true +} + +/// Find the `.dist-info` directory in a zipped wheel. +/// +/// The metadata name may be uppercase, while the wheel and dist info names are lowercase, or +/// the metadata name and the dist info name are lowercase, while the wheel name is uppercase. +/// Either way, we just search the wheel for the name. +/// +/// Returns the dist info dir prefix without the `.dist-info` extension. +/// +/// Reference implementation: +pub fn find_archive_dist_info<'a, T: Copy>( + filename: &WheelFilename, + files: impl Iterator, +) -> Result<(T, &'a str), Error> { + let metadatas: Vec<_> = files + .filter_map(|(payload, path)| { + let (dist_info_dir, file) = path.split_once('/')?; + if file != "METADATA" { + return None; + } + + let dir_stem = dist_info_dir.strip_suffix(".dist-info")?; + let (name, version) = dir_stem.rsplit_once('-')?; + if PackageName::from_str(name).ok()? != filename.name { + return None; + } + + if Version::from_str(version).ok()? != filename.version { + return None; + } + + Some((payload, dir_stem)) + }) + .collect(); + let (payload, dist_info_prefix) = match metadatas[..] { + [] => { + return Err(Error::MissingDistInfo); + } + [(payload, path)] => (payload, path), + _ => { + return Err(Error::MultipleDistInfo( + metadatas + .into_iter() + .map(|(_, dist_info_dir)| dist_info_dir.to_string()) + .collect::>() + .join(", "), + )); + } + }; + Ok((payload, dist_info_prefix)) +} + +/// Given an archive, read the `METADATA` from the `.dist-info` directory. +pub fn read_archive_metadata( + filename: &WheelFilename, + archive: &mut ZipArchive, +) -> Result, Error> { + let dist_info_prefix = + find_archive_dist_info(filename, archive.file_names().map(|name| (name, name)))?.1; + + let mut file = archive + .by_name(&format!("{dist_info_prefix}.dist-info/METADATA")) + .map_err(|err| Error::Zip(filename.to_string(), err))?; + + #[allow(clippy::cast_possible_truncation)] + let mut buffer = Vec::with_capacity(file.size() as usize); + file.read_to_end(&mut buffer)?; + + Ok(buffer) +} + +/// Find the `.dist-info` directory in an unzipped wheel. +/// +/// See: +pub fn find_flat_dist_info( + filename: &WheelFilename, + path: impl AsRef, +) -> Result { + // Iterate over `path` to find the `.dist-info` directory. It should be at the top-level. + let Some(dist_info) = fs_err::read_dir(path.as_ref())?.find_map(|entry| { + let entry = entry.ok()?; + let file_type = entry.file_type().ok()?; + if file_type.is_dir() { + let path = entry.path(); + + let extension = path.extension()?; + if extension != "dist-info" { + return None; + } + + let stem = path.file_stem()?; + let (name, version) = stem.to_str()?.rsplit_once('-')?; + if PackageName::from_str(name).ok()? != filename.name { + return None; + } + if Version::from_str(version).ok()? 
!= filename.version { + return None; + } + + Some(path) + } else { + None + } + }) else { + return Err(Error::InvalidWheel( + "Missing .dist-info directory".to_string(), + )); + }; + + let Some(dist_info_prefix) = dist_info.file_stem() else { + return Err(Error::InvalidWheel( + "Missing .dist-info directory".to_string(), + )); + }; + + Ok(dist_info_prefix.to_string_lossy().to_string()) +} + +/// Read the wheel `METADATA` metadata from a `.dist-info` directory. +pub fn read_dist_info_metadata( + dist_info_prefix: &str, + wheel: impl AsRef, +) -> Result, Error> { + let metadata_file = wheel + .as_ref() + .join(format!("{dist_info_prefix}.dist-info/METADATA")); + Ok(fs_err::read(metadata_file)?) +} + +#[cfg(test)] +mod test { + use std::str::FromStr; + + use distribution_filename::WheelFilename; + + use crate::metadata::find_archive_dist_info; + + #[test] + fn test_dot_in_name() { + let files = [ + "mastodon/Mastodon.py", + "mastodon/__init__.py", + "mastodon/streaming.py", + "Mastodon.py-1.5.1.dist-info/DESCRIPTION.rst", + "Mastodon.py-1.5.1.dist-info/metadata.json", + "Mastodon.py-1.5.1.dist-info/top_level.txt", + "Mastodon.py-1.5.1.dist-info/WHEEL", + "Mastodon.py-1.5.1.dist-info/METADATA", + "Mastodon.py-1.5.1.dist-info/RECORD", + ]; + let filename = WheelFilename::from_str("Mastodon.py-1.5.1-py2.py3-none-any.whl").unwrap(); + let (_, dist_info_prefix) = + find_archive_dist_info(&filename, files.into_iter().map(|file| (file, file))).unwrap(); + assert_eq!(dist_info_prefix, "Mastodon.py-1.5.1"); + } +} diff --git a/crates/uv-client/src/error.rs b/crates/uv-client/src/error.rs index 461ea709b..542001d73 100644 --- a/crates/uv-client/src/error.rs +++ b/crates/uv-client/src/error.rs @@ -182,7 +182,7 @@ pub enum ErrorKind { metadata: PackageName, }, - #[error("The wheel {0} is not a valid zip file")] + #[error("Failed to unzip wheel: {0}")] Zip(WheelFilename, #[source] ZipError), #[error("Failed to write to the client cache")] diff --git a/crates/uv-client/src/registry_client.rs b/crates/uv-client/src/registry_client.rs index 3585b79be..b28960919 100644 --- a/crates/uv-client/src/registry_client.rs +++ b/crates/uv-client/src/registry_client.rs @@ -18,7 +18,7 @@ use url::Url; use distribution_filename::{DistFilename, SourceDistFilename, WheelFilename}; use distribution_types::{BuiltDist, File, FileLocation, IndexUrl, IndexUrls, Name}; -use install_wheel_rs::{find_dist_info, is_metadata_entry}; +use install_wheel_rs::metadata::{find_archive_dist_info, is_metadata_entry}; use pep440_rs::Version; use pypi_types::{Metadata23, SimpleJson}; use uv_auth::safe_copy_url_auth; @@ -602,7 +602,7 @@ async fn read_metadata_async_seek( .await .map_err(|err| ErrorKind::Zip(filename.clone(), err))?; - let (metadata_idx, _dist_info_prefix) = find_dist_info( + let (metadata_idx, _dist_info_prefix) = find_archive_dist_info( filename, zip_reader .file() diff --git a/crates/uv-client/src/remote_metadata.rs b/crates/uv-client/src/remote_metadata.rs index e4e3fe02d..548968e2f 100644 --- a/crates/uv-client/src/remote_metadata.rs +++ b/crates/uv-client/src/remote_metadata.rs @@ -3,7 +3,7 @@ use async_zip::tokio::read::seek::ZipFileReader; use tokio_util::compat::TokioAsyncReadCompatExt; use distribution_filename::WheelFilename; -use install_wheel_rs::find_dist_info; +use install_wheel_rs::metadata::find_archive_dist_info; use crate::{Error, ErrorKind}; @@ -65,7 +65,7 @@ pub(crate) async fn wheel_metadata_from_remote_zip( .await .map_err(|err| ErrorKind::Zip(filename.clone(), err))?; - let ((metadata_idx, 
metadata_entry), _dist_info_prefix) = find_dist_info( + let ((metadata_idx, metadata_entry), _dist_info_prefix) = find_archive_dist_info( filename, reader .file() diff --git a/crates/uv-distribution/src/distribution_database.rs b/crates/uv-distribution/src/distribution_database.rs index bdc925613..91b2eb95b 100644 --- a/crates/uv-distribution/src/distribution_database.rs +++ b/crates/uv-distribution/src/distribution_database.rs @@ -4,8 +4,9 @@ use std::path::{Path, PathBuf}; use std::sync::Arc; use futures::{FutureExt, TryStreamExt}; +use tokio::io::AsyncSeekExt; use tokio_util::compat::FuturesAsyncReadCompatExt; -use tracing::{info_span, instrument, Instrument}; +use tracing::{info_span, instrument, warn, Instrument}; use url::Url; use distribution_filename::WheelFilename; @@ -158,14 +159,33 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> ); // Download and unzip. - let archive = self + match self .stream_wheel(url.clone(), &wheel.filename, &wheel_entry, &dist) - .await?; - Ok(LocalWheel::Unzipped(UnzippedWheel { - dist: dist.clone(), - archive, - filename: wheel.filename.clone(), - })) + .await + { + Ok(archive) => Ok(LocalWheel::Unzipped(UnzippedWheel { + dist: dist.clone(), + archive, + filename: wheel.filename.clone(), + })), + Err(Error::Extract(err)) if err.is_http_streaming_unsupported() => { + warn!( + "Streaming unsupported for {dist}; downloading wheel to disk ({err})" + ); + + // If the request failed because streaming is unsupported, download the + // wheel directly. + let archive = self + .download_wheel(url, &wheel.filename, &wheel_entry, &dist) + .await?; + Ok(LocalWheel::Unzipped(UnzippedWheel { + dist: dist.clone(), + archive, + filename: wheel.filename.clone(), + })) + } + Err(err) => Err(err), + } } Dist::Built(BuiltDist::DirectUrl(wheel)) => { @@ -181,19 +201,43 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> ); // Download and unzip. - let archive = self + match self .stream_wheel( wheel.url.raw().clone(), &wheel.filename, &wheel_entry, &dist, ) - .await?; - Ok(LocalWheel::Unzipped(UnzippedWheel { - dist: dist.clone(), - archive, - filename: wheel.filename.clone(), - })) + .await + { + Ok(archive) => Ok(LocalWheel::Unzipped(UnzippedWheel { + dist: dist.clone(), + archive, + filename: wheel.filename.clone(), + })), + Err(Error::Client(err)) if err.is_http_streaming_unsupported() => { + warn!( + "Streaming unsupported for {dist}; downloading wheel to disk ({err})" + ); + + // If the request failed because streaming is unsupported, download the + // wheel directly. + let archive = self + .download_wheel( + wheel.url.raw().clone(), + &wheel.filename, + &wheel_entry, + &dist, + ) + .await?; + Ok(LocalWheel::Unzipped(UnzippedWheel { + dist: dist.clone(), + archive, + filename: wheel.filename.clone(), + })) + } + Err(err) => Err(err), + } } Dist::Built(BuiltDist::Path(wheel)) => { @@ -277,7 +321,18 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> ) -> Result<(Metadata23, Option), Error> { match dist { Dist::Built(built_dist) => { - Ok((self.client.wheel_metadata(built_dist).boxed().await?, None)) + match self.client.wheel_metadata(built_dist).boxed().await { + Ok(metadata) => Ok((metadata, None)), + Err(err) if err.is_http_streaming_unsupported() => { + warn!("Streaming unsupported when fetching metadata for {dist}; downloading wheel directly ({err})"); + + // If the request failed due to an error that could be resolved by + // downloading the wheel directly, try that. 
+ let wheel = self.get_or_build_wheel(dist.clone()).await?; + Ok((wheel.metadata()?, None)) + } + Err(err) => Err(err.into()), + } } Dist::Source(source_dist) => { let no_build = match self.build_context.no_build() { @@ -437,6 +492,87 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> Ok(archive) } + /// Download a wheel from a URL, then unzip it into the cache. + async fn download_wheel( + &self, + url: Url, + filename: &WheelFilename, + wheel_entry: &CacheEntry, + dist: &Dist, + ) -> Result { + // Create an entry for the HTTP cache. + let http_entry = wheel_entry.with_file(format!("{}.http", filename.stem())); + + let download = |response: reqwest::Response| { + async { + let reader = response + .bytes_stream() + .map_err(|err| self.handle_response_errors(err)) + .into_async_read(); + + // Download the wheel to a temporary file. + let temp_file = + tempfile::tempfile_in(self.cache.root()).map_err(Error::CacheWrite)?; + let mut writer = tokio::io::BufWriter::new(tokio::fs::File::from_std(temp_file)); + tokio::io::copy(&mut reader.compat(), &mut writer) + .await + .map_err(Error::CacheWrite)?; + + // Unzip the wheel to a temporary directory. + let temp_dir = + tempfile::tempdir_in(self.cache.root()).map_err(Error::CacheWrite)?; + let mut file = writer.into_inner(); + file.seek(io::SeekFrom::Start(0)) + .await + .map_err(Error::CacheWrite)?; + let reader = tokio::io::BufReader::new(file); + uv_extract::seek::unzip(reader, temp_dir.path()).await?; + + // Persist the temporary directory to the directory store. + let archive = self + .cache + .persist(temp_dir.into_path(), wheel_entry.path()) + .map_err(Error::CacheRead)?; + Ok(archive) + } + .instrument(info_span!("wheel", wheel = %dist)) + }; + + let req = self + .client + .cached_client() + .uncached() + .get(url) + .header( + // `reqwest` defaults to accepting compressed responses. + // Specify identity encoding to get consistent .whl downloading + // behavior from servers. ref: https://github.com/pypa/pip/pull/1688 + "accept-encoding", + reqwest::header::HeaderValue::from_static("identity"), + ) + .build()?; + let cache_control = match self.client.connectivity() { + Connectivity::Online => CacheControl::from( + self.cache + .freshness(&http_entry, Some(&filename.name)) + .map_err(Error::CacheRead)?, + ), + Connectivity::Offline => CacheControl::AllowStale, + }; + + let archive = self + .client + .cached_client() + .get_serde(req, &http_entry, cache_control, download) + .await + .map_err(|err| match err { + CachedClientError::Callback(err) => err, + CachedClientError::Client(err) => Error::Client(err), + })?; + + Ok(archive) + } + /// Return the [`IndexLocations`] used by this resolver. pub fn index_locations(&self) -> &IndexLocations { self.build_context.index_locations() diff --git a/crates/uv-distribution/src/download.rs b/crates/uv-distribution/src/download.rs index 9b0a3a563..7ee3726b5 100644 --- a/crates/uv-distribution/src/download.rs +++ b/crates/uv-distribution/src/download.rs @@ -2,6 +2,9 @@ use std::path::{Path, PathBuf}; use distribution_filename::WheelFilename; use distribution_types::{CachedDist, Dist}; +use pypi_types::Metadata23; + +use crate::Error; /// A wheel that's been unzipped while downloading #[derive(Debug, Clone)] @@ -87,6 +90,15 @@ impl LocalWheel { Self::Built(wheel) => CachedDist::from_remote(wheel.dist, wheel.filename, archive), } } + + /// Read the [`Metadata23`] from a wheel. 
+ pub fn metadata(&self) -> Result { + match self { + Self::Unzipped(wheel) => read_flat_wheel_metadata(&wheel.filename, &wheel.archive), + Self::Disk(wheel) => read_built_wheel_metadata(&wheel.filename, &wheel.path), + Self::Built(wheel) => read_built_wheel_metadata(&wheel.filename, &wheel.path), + } + } } impl UnzippedWheel { @@ -121,3 +133,25 @@ impl std::fmt::Display for LocalWheel { write!(f, "{}", self.remote()) } } + +/// Read the [`Metadata23`] from a built wheel. +fn read_built_wheel_metadata( + filename: &WheelFilename, + wheel: impl AsRef, +) -> Result { + let file = fs_err::File::open(wheel.as_ref()).map_err(Error::CacheRead)?; + let reader = std::io::BufReader::new(file); + let mut archive = zip::ZipArchive::new(reader)?; + let metadata = install_wheel_rs::metadata::read_archive_metadata(filename, &mut archive)?; + Ok(Metadata23::parse_metadata(&metadata)?) +} + +/// Read the [`Metadata23`] from an unzipped wheel. +fn read_flat_wheel_metadata( + filename: &WheelFilename, + wheel: impl AsRef, +) -> Result { + let dist_info = install_wheel_rs::metadata::find_flat_dist_info(filename, &wheel)?; + let metadata = install_wheel_rs::metadata::read_dist_info_metadata(&dist_info, &wheel)?; + Ok(Metadata23::parse_metadata(&metadata)?) +} diff --git a/crates/uv-distribution/src/error.rs b/crates/uv-distribution/src/error.rs index 913e0ed56..3bf8cd325 100644 --- a/crates/uv-distribution/src/error.rs +++ b/crates/uv-distribution/src/error.rs @@ -55,7 +55,7 @@ pub enum Error { Zip(#[from] ZipError), #[error("Source distribution directory contains neither readable pyproject.toml nor setup.py")] DirWithoutEntrypoint, - #[error("Failed to extract source distribution")] + #[error("Failed to extract archive")] Extract(#[from] uv_extract::Error), #[error("Source distribution not found at: {0}")] NotFound(PathBuf), diff --git a/crates/uv-distribution/src/source/mod.rs b/crates/uv-distribution/src/source/mod.rs index 3ec954223..b716be1f7 100644 --- a/crates/uv-distribution/src/source/mod.rs +++ b/crates/uv-distribution/src/source/mod.rs @@ -19,7 +19,7 @@ use distribution_types::{ DirectArchiveUrl, DirectGitUrl, Dist, FileLocation, GitSourceDist, LocalEditable, Name, PathSourceDist, RemoteSource, SourceDist, }; -use install_wheel_rs::read_dist_info; +use install_wheel_rs::metadata::read_archive_metadata; use pep508_rs::VerbatimUrl; use platform_tags::Tags; use pypi_types::Metadata23; @@ -903,7 +903,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { let reader = fs_err::tokio::File::open(&path) .await .map_err(Error::CacheRead)?; - uv_extract::stream::archive(tokio::io::BufReader::new(reader), path, &temp_dir.path()) + uv_extract::seek::archive(tokio::io::BufReader::new(reader), path, &temp_dir.path()) .await?; // Extract the top-level directory from the archive. @@ -1212,6 +1212,6 @@ fn read_wheel_metadata( let file = fs_err::File::open(wheel).map_err(Error::CacheRead)?; let reader = std::io::BufReader::new(file); let mut archive = ZipArchive::new(reader)?; - let dist_info = read_dist_info(filename, &mut archive)?; + let dist_info = read_archive_metadata(filename, &mut archive)?; Ok(Metadata23::parse_metadata(&dist_info)?) 
} diff --git a/crates/uv-extract/src/error.rs b/crates/uv-extract/src/error.rs index 4f9578560..275caed53 100644 --- a/crates/uv-extract/src/error.rs +++ b/crates/uv-extract/src/error.rs @@ -1,11 +1,9 @@ use std::{ffi::OsString, path::PathBuf}; -use zip::result::ZipError; - #[derive(Debug, thiserror::Error)] pub enum Error { #[error(transparent)] - Zip(#[from] ZipError), + Zip(#[from] zip::result::ZipError), #[error(transparent)] AsyncZip(#[from] async_zip::error::ZipError), #[error(transparent)] @@ -19,3 +17,15 @@ pub enum Error { #[error("The top-level of the archive must only contain a list directory, but it's empty")] EmptyArchive, } + +impl Error { + /// Returns `true` if the error is due to the server not supporting HTTP streaming. Most + /// commonly, this is due to serving ZIP files with features that are incompatible with + /// streaming, like data descriptors. + pub fn is_http_streaming_unsupported(&self) -> bool { + matches!( + self, + Self::AsyncZip(async_zip::error::ZipError::FeatureNotSupported(_)) + ) + } +} diff --git a/crates/uv-extract/src/lib.rs b/crates/uv-extract/src/lib.rs index 9cd38d3f0..20d433071 100644 --- a/crates/uv-extract/src/lib.rs +++ b/crates/uv-extract/src/lib.rs @@ -2,6 +2,7 @@ pub use error::Error; pub use sync::*; mod error; +pub mod seek; pub mod stream; mod sync; mod tar; diff --git a/crates/uv-extract/src/seek.rs b/crates/uv-extract/src/seek.rs new file mode 100644 index 000000000..c4adbbb6c --- /dev/null +++ b/crates/uv-extract/src/seek.rs @@ -0,0 +1,115 @@ +use std::path::Path; + +use rustc_hash::FxHashSet; +use tokio_util::compat::FuturesAsyncReadCompatExt; +use tokio_util::compat::TokioAsyncReadCompatExt; + +use crate::Error; + +/// Unzip a `.zip` archive into the target directory, requiring `Seek`. +/// +/// This is useful for unzipping files asynchronously that already exist on disk. +pub async fn unzip( + reader: R, + target: impl AsRef, +) -> Result<(), Error> { + let target = target.as_ref(); + let mut reader = reader.compat(); + let mut zip = async_zip::base::read::seek::ZipFileReader::new(&mut reader).await?; + + let mut directories = FxHashSet::default(); + + for index in 0..zip.file().entries().len() { + let reader = zip.reader_with_entry(index).await?; + + // Construct the (expected) path to the file on-disk. + let path = reader.entry().filename().as_str()?; + let path = target.join(path); + let is_dir = reader.entry().dir()?; + + // Either create the directory or write the file to disk. + if is_dir { + if directories.insert(path.clone()) { + fs_err::tokio::create_dir_all(path).await?; + } + } else { + if let Some(parent) = path.parent() { + if directories.insert(parent.to_path_buf()) { + fs_err::tokio::create_dir_all(parent).await?; + } + } + + // Copy the mode. + #[cfg(unix)] + let mode = reader.entry().unix_permissions(); + + // Copy the file contents. + let file = fs_err::tokio::File::create(&path).await?; + let mut writer = if let Ok(size) = usize::try_from(reader.entry().uncompressed_size()) { + tokio::io::BufWriter::with_capacity(size, file) + } else { + tokio::io::BufWriter::new(file) + }; + tokio::io::copy(&mut reader.compat(), &mut writer).await?; + + // See `uv_extract::stream::unzip`. + #[cfg(unix)] + { + use std::fs::Permissions; + use std::os::unix::fs::PermissionsExt; + + let Some(mode) = mode else { + continue; + }; + + // The executable bit is the only permission we preserve, otherwise we use the OS defaults. 
+ // https://github.com/pypa/pip/blob/3898741e29b7279e7bffe044ecfbe20f6a438b1e/src/pip/_internal/utils/unpacking.py#L88-L100 + let has_any_executable_bit = mode & 0o111; + if has_any_executable_bit != 0 { + let permissions = fs_err::tokio::metadata(&path).await?.permissions(); + fs_err::tokio::set_permissions( + &path, + Permissions::from_mode(permissions.mode() | 0o111), + ) + .await?; + } + } + } + } + + Ok(()) +} + +/// Unzip a `.zip` or `.tar.gz` archive into the target directory, requiring `Seek`. +pub async fn archive( + reader: R, + source: impl AsRef, + target: impl AsRef, +) -> Result<(), Error> { + // `.zip` + if source + .as_ref() + .extension() + .is_some_and(|ext| ext.eq_ignore_ascii_case("zip")) + { + unzip(reader, target).await?; + return Ok(()); + } + + // `.tar.gz` + if source + .as_ref() + .extension() + .is_some_and(|ext| ext.eq_ignore_ascii_case("gz")) + && source.as_ref().file_stem().is_some_and(|stem| { + Path::new(stem) + .extension() + .is_some_and(|ext| ext.eq_ignore_ascii_case("tar")) + }) + { + crate::stream::untar(reader, target).await?; + return Ok(()); + } + + Err(Error::UnsupportedArchive(source.as_ref().to_path_buf())) +} diff --git a/crates/uv/tests/pip_compile.rs b/crates/uv/tests/pip_compile.rs index d64dfce05..50bee6658 100644 --- a/crates/uv/tests/pip_compile.rs +++ b/crates/uv/tests/pip_compile.rs @@ -4949,3 +4949,41 @@ fn metadata_2_2() -> Result<()> { Ok(()) } + +/// Resolve packages from an index that "doesn't support" zip file streaming (by way of using +/// data descriptors). +#[test] +fn no_stream() -> Result<()> { + let context = TestContext::new("3.12"); + + // Write to a requirements file. + let requirements_in = context.temp_dir.child("requirements.in"); + requirements_in + .write_str("hashb_foxglove_protocolbuffers_python==25.3.0.1.20240226043130+465630478360")?; + + uv_snapshot!(Command::new(get_bin()) + .arg("pip") + .arg("compile") + .arg("requirements.in") + .arg("--extra-index-url") + .arg("https://buf.build/gen/python") + .arg("--cache-dir") + .arg(context.cache_dir.path()) + .env("VIRTUAL_ENV", context.venv.as_os_str()) + .current_dir(&context.temp_dir), @r###" + success: true + exit_code: 0 + ----- stdout ----- + # This file was autogenerated by uv via the following command: + # uv pip compile requirements.in --cache-dir [CACHE_DIR] + hashb-foxglove-protocolbuffers-python==25.3.0.1.20240226043130+465630478360 + protobuf==4.25.3 + # via hashb-foxglove-protocolbuffers-python + + ----- stderr ----- + Resolved 2 packages in [TIME] + "### + ); + + Ok(()) +} diff --git a/crates/uv/tests/pip_sync.rs b/crates/uv/tests/pip_sync.rs index 7dfd99777..78aec59e6 100644 --- a/crates/uv/tests/pip_sync.rs +++ b/crates/uv/tests/pip_sync.rs @@ -2994,3 +2994,33 @@ requires-python = "<=3.5" Ok(()) } + +/// Install packages from an index that "doesn't support" zip file streaming (by way of using +/// data descriptors). +#[test] +fn no_stream() -> Result<()> { + let context = TestContext::new("3.12"); + + // Write to a requirements file. 
+ let requirements_txt = context.temp_dir.child("requirements.txt"); + requirements_txt + .write_str("hashb_foxglove_protocolbuffers_python==25.3.0.1.20240226043130+465630478360")?; + + uv_snapshot!(command(&context) + .arg("requirements.txt") + .arg("--index-url") + .arg("https://buf.build/gen/python"), @r###" + success: true + exit_code: 0 + ----- stdout ----- + + ----- stderr ----- + Resolved 1 package in [TIME] + Downloaded 1 package in [TIME] + Installed 1 package in [TIME] + + hashb-foxglove-protocolbuffers-python==25.3.0.1.20240226043130+465630478360 + "### + ); + + Ok(()) +}
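
As a rough illustration of the fallback this patch introduces (not part of the diff above): the happy path streams and unzips the wheel in a single pass, and only when `async_zip` reports `FeatureNotSupported` (surfaced via the new `uv_extract::Error::is_http_streaming_unsupported`) does the code re-fetch the wheel, spill it to a temporary file, and hand a seekable reader to the new `uv_extract::seek::unzip`. The sketch below condenses that control flow into one hypothetical helper; the name `fetch_and_unzip`, the `anyhow`-based error handling, the bare `reqwest::Client`, and the assumption that `uv_extract::stream::unzip` takes a reader plus target directory like its `seek` counterpart are all illustrative, not code from this PR.

```rust
use std::io::SeekFrom;
use std::path::Path;

use futures::TryStreamExt;
use tokio::io::{AsyncSeekExt, AsyncWriteExt};
use tokio_util::compat::FuturesAsyncReadCompatExt;
use url::Url;

/// Hypothetical helper: prefer the streaming unzip, and fall back to a seekable
/// on-disk copy when the zip can't be streamed (e.g., it uses data descriptors).
async fn fetch_and_unzip(
    client: &reqwest::Client,
    url: Url,
    target: &Path,
    scratch: &Path,
) -> anyhow::Result<()> {
    // First attempt: unzip the body as it streams in.
    let response = client.get(url.clone()).send().await?.error_for_status()?;
    let reader = response
        .bytes_stream()
        .map_err(|err| std::io::Error::new(std::io::ErrorKind::Other, err))
        .into_async_read();
    match uv_extract::stream::unzip(reader.compat(), target).await {
        Ok(()) => return Ok(()),
        // `FeatureNotSupported` from `async_zip` means this archive needs `Seek`.
        Err(err) if err.is_http_streaming_unsupported() => {}
        Err(err) => return Err(err.into()),
    }

    // Fallback: re-issue the request and write the body to a temporary file, so the
    // seek-based unzipper can read the central directory at the end of the archive.
    let response = client.get(url).send().await?.error_for_status()?;
    let reader = response
        .bytes_stream()
        .map_err(|err| std::io::Error::new(std::io::ErrorKind::Other, err))
        .into_async_read();
    let mut file = tokio::fs::File::from_std(tempfile::tempfile_in(scratch)?);
    tokio::io::copy(&mut reader.compat(), &mut file).await?;
    file.flush().await?;

    // Rewind before unzipping from disk.
    file.seek(SeekFrom::Start(0)).await?;
    uv_extract::seek::unzip(tokio::io::BufReader::new(file), target).await?;
    Ok(())
}
```

In the PR itself the retry happens inside `DistributionDatabase`: the streaming error is caught at the call site and routed to the new `download_wheel`, which re-issues the request through the cached client (with `accept-encoding: identity`) rather than a bare `reqwest::Client`, and persists the unzipped directory into the cache.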