Move source distribution unpacking out of build (#2294)

## Summary

If a user provides a source distribution via a direct path, it can
either be an archive (like a `.tar.gz` or `.zip` file) or a directory.
If the former, we need to extract (e.g., unzip) the contents at some
point. Previously, this extraction was in `uv-build`; this PR lifts it
up to the distribution database.

The first benefit here is that various methods that take the
distribution are now simpler, as they can assume a directory.

The second benefit is that we no longer extract _multiple times_ when
working with a source distribution. (Previously, if we tried to get the
metadata, then fell back and built the wheel, we'd extract the wheel
_twice_.)
This commit is contained in:
Charlie Marsh 2024-03-07 19:40:11 -08:00 committed by GitHub
parent e321a2767d
commit 26f6919465
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 107 additions and 47 deletions

1
Cargo.lock generated
View file

@ -4282,7 +4282,6 @@ dependencies = [
"tokio",
"toml",
"tracing",
"uv-extract",
"uv-fs",
"uv-interpreter",
"uv-traits",

View file

@ -18,7 +18,6 @@ distribution-types = { path = "../distribution-types" }
pep508_rs = { path = "../pep508-rs" }
platform-host = { path = "../platform-host" }
pypi-types = { path = "../pypi-types" }
uv-extract = { path = "../uv-extract" }
uv-fs = { path = "../uv-fs" }
uv-interpreter = { path = "../uv-interpreter" }
uv-traits = { path = "../uv-traits", features = ["serde"] }

View file

@ -62,10 +62,6 @@ static DEFAULT_BACKEND: Lazy<Pep517Backend> = Lazy::new(|| Pep517Backend {
pub enum Error {
#[error(transparent)]
IO(#[from] io::Error),
#[error("Failed to extract archive: {0}")]
Extraction(PathBuf, #[source] uv_extract::Error),
#[error("Unsupported archive format (extension not recognized): {0}")]
UnsupportedArchiveType(String),
#[error("Invalid source distribution: {0}")]
InvalidSourceDist(String),
#[error("Invalid pyproject.toml")]
@ -74,8 +70,6 @@ pub enum Error {
EditableSetupPy,
#[error("Failed to install requirements from {0}")]
RequirementsInstall(&'static str, #[source] anyhow::Error),
#[error("Source distribution not found at: {0}")]
NotFound(PathBuf),
#[error("Failed to create temporary virtualenv")]
Virtualenv(#[from] uv_virtualenv::Error),
#[error("Failed to run {0}")]
@ -365,38 +359,10 @@ impl SourceBuild {
) -> Result<Self, Error> {
let temp_dir = tempdir_in(build_context.cache().root())?;
let metadata = match fs::metadata(source) {
Ok(metadata) => metadata,
Err(err) if err.kind() == io::ErrorKind::NotFound => {
return Err(Error::NotFound(source.to_path_buf()));
}
Err(err) => return Err(err.into()),
};
let source_root = if metadata.is_dir() {
source.to_path_buf()
} else {
debug!("Unpacking for build: {}", source.display());
let extracted = temp_dir.path().join("extracted");
// Unzip the archive into the temporary directory.
let reader = fs_err::tokio::File::open(source).await?;
uv_extract::stream::archive(tokio::io::BufReader::new(reader), source, &extracted)
.await
.map_err(|err| Error::Extraction(extracted.clone(), err))?;
// Extract the top-level directory from the archive.
match uv_extract::strip_component(&extracted) {
Ok(top_level) => top_level,
Err(uv_extract::Error::NonSingularArchive(_)) => extracted,
Err(err) => return Err(Error::Extraction(extracted.clone(), err)),
}
};
let source_tree = if let Some(subdir) = subdirectory {
source_root.join(subdir)
source.join(subdir)
} else {
source_root
source.to_path_buf()
};
let default_backend: Pep517Backend = DEFAULT_BACKEND.clone();

View file

@ -28,7 +28,7 @@ pub(crate) struct BuildArgs {
/// Directory to story the built wheel in
#[clap(short, long)]
wheels: Option<PathBuf>,
/// The source distribution to build, either a directory or a source archive.
/// The source distribution to build, as a directory.
sdist: PathBuf,
/// The subdirectory to build within the source distribution.
subdirectory: Option<PathBuf>,

View file

@ -1,3 +1,4 @@
use std::path::PathBuf;
use tokio::task::JoinError;
use zip::result::ZipError;
@ -56,6 +57,8 @@ pub enum Error {
DirWithoutEntrypoint,
#[error("Failed to extract source distribution")]
Extract(#[from] uv_extract::Error),
#[error("Source distribution not found at: {0}")]
NotFound(PathBuf),
/// Should not occur; only seen when another task panicked.
#[error("The task executor is broken, did some other task panic?")]

View file

@ -8,6 +8,7 @@ use anyhow::Result;
use fs_err::tokio as fs;
use futures::{FutureExt, TryStreamExt};
use reqwest::Response;
use tempfile::TempDir;
use tokio_util::compat::FuturesAsyncReadCompatExt;
use tracing::{debug, info_span, instrument, Instrument};
use url::Url;
@ -113,6 +114,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
Url::parse(url).map_err(|err| Error::Url(url.clone(), err))?
}
FileLocation::Path(path) => {
// Create a distribution to represent the local path.
let path_source_dist = PathSourceDist {
name: registry_source_dist.filename.name.clone(),
url: VerbatimUrl::unknown(
@ -121,7 +123,14 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
path: path.clone(),
editable: false,
};
return self.path(source_dist, &path_source_dist).boxed().await;
// If necessary, extract the archive.
let extracted = self.extract_archive(&path_source_dist).await?;
return self
.path(source_dist, &path_source_dist, extracted.path())
.boxed()
.await;
}
};
@ -147,7 +156,12 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
self.git(source_dist, git_source_dist).boxed().await?
}
SourceDist::Path(path_source_dist) => {
self.path(source_dist, path_source_dist).boxed().await?
// If necessary, extract the archive.
let extracted = self.extract_archive(path_source_dist).await?;
self.path(source_dist, path_source_dist, extracted.path())
.boxed()
.await?
}
};
@ -194,6 +208,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
Url::parse(url).map_err(|err| Error::Url(url.clone(), err))?
}
FileLocation::Path(path) => {
// Create a distribution to represent the local path.
let path_source_dist = PathSourceDist {
name: registry_source_dist.filename.name.clone(),
url: VerbatimUrl::unknown(
@ -202,8 +217,12 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
path: path.clone(),
editable: false,
};
// If necessary, extract the archive.
let extracted = self.extract_archive(&path_source_dist).await?;
return self
.path_metadata(source_dist, &path_source_dist)
.path_metadata(source_dist, &path_source_dist, extracted.path())
.boxed()
.await;
}
@ -233,7 +252,10 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
.await?
}
SourceDist::Path(path_source_dist) => {
self.path_metadata(source_dist, path_source_dist)
// If necessary, extract the archive.
let extracted = self.extract_archive(path_source_dist).await?;
self.path_metadata(source_dist, path_source_dist, extracted.path())
.boxed()
.await?
}
@ -468,6 +490,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
&self,
source_dist: &SourceDist,
path_source_dist: &PathSourceDist,
source_root: &Path,
) -> Result<BuiltWheelMetadata, Error> {
let cache_shard = self.build_context.cache().shard(
CacheBucket::BuiltWheels,
@ -510,7 +533,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
.map(|reporter| reporter.on_build_start(source_dist));
let (disk_filename, filename, metadata) = self
.build_source_dist(source_dist, &path_source_dist.path, None, &cache_shard)
.build_source_dist(source_dist, source_root, None, &cache_shard)
.await?;
if let Some(task) = task {
@ -540,6 +563,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
&self,
source_dist: &SourceDist,
path_source_dist: &PathSourceDist,
source_root: &Path,
) -> Result<Metadata21, Error> {
let cache_shard = self.build_context.cache().shard(
CacheBucket::BuiltWheels,
@ -586,7 +610,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
// If the backend supports `prepare_metadata_for_build_wheel`, use it.
if let Some(metadata) = self
.build_source_dist_metadata(source_dist, &path_source_dist.path, None)
.build_source_dist_metadata(source_dist, source_root, None)
.boxed()
.await?
{
@ -609,7 +633,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
.map(|reporter| reporter.on_build_start(source_dist));
let (_disk_filename, _filename, metadata) = self
.build_source_dist(source_dist, &path_source_dist.path, None, &cache_shard)
.build_source_dist(source_dist, source_root, None, &cache_shard)
.await?;
if let Some(task) = task {
@ -834,6 +858,51 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
Ok((fetch, subdirectory))
}
/// Extract a local source distribution, if it's stored as a `.tar.gz` or `.zip` archive.
///
/// TODO(charlie): Consider storing the extracted source in the cache, to avoid re-extracting
/// on every invocation.
async fn extract_archive(
&self,
source_dist: &'a PathSourceDist,
) -> Result<ExtractedSource<'a>, Error> {
// If necessary, unzip the source distribution.
let path = source_dist.path.as_path();
let metadata = match fs::metadata(&path).await {
Ok(metadata) => metadata,
Err(err) if err.kind() == std::io::ErrorKind::NotFound => {
return Err(Error::NotFound(path.to_path_buf()));
}
Err(err) => return Err(Error::CacheRead(err)),
};
if metadata.is_dir() {
Ok(ExtractedSource::Directory(path))
} else {
debug!("Unpacking for build: {source_dist}");
let temp_dir = tempfile::tempdir_in(self.build_context.cache().root())
.map_err(Error::CacheWrite)?;
// Unzip the archive into the temporary directory.
let reader = fs_err::tokio::File::open(&path)
.await
.map_err(Error::CacheRead)?;
uv_extract::stream::archive(tokio::io::BufReader::new(reader), path, &temp_dir.path())
.await?;
// Extract the top-level directory from the archive.
let extracted = match uv_extract::strip_component(temp_dir.path()) {
Ok(top_level) => top_level,
Err(uv_extract::Error::NonSingularArchive(_)) => temp_dir.path().to_path_buf(),
Err(err) => return Err(err.into()),
};
Ok(ExtractedSource::Archive(extracted, temp_dir))
}
}
/// Build a source distribution, storing the built wheel in the cache.
///
/// Returns the un-normalized disk filename, the parsed, normalized filename and the metadata
@ -950,6 +1019,11 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
) -> Result<(Dist, String, WheelFilename, Metadata21), Error> {
debug!("Building (editable) {editable}");
// Verify that the editable exists.
if !editable.path.exists() {
return Err(Error::NotFound(editable.path.clone()));
}
// Build the wheel.
let disk_filename = self
.build_context
@ -980,6 +1054,26 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
}
}
#[derive(Debug)]
enum ExtractedSource<'a> {
/// The source distribution was passed in as a directory, and so doesn't need to be extracted.
Directory(&'a Path),
/// The source distribution was passed in as an archive, and was extracted into a temporary
/// directory.
#[allow(dead_code)]
Archive(PathBuf, TempDir),
}
impl ExtractedSource<'_> {
/// Return the [`Path`] to the extracted source root.
fn path(&self) -> &Path {
match self {
ExtractedSource::Directory(path) => path,
ExtractedSource::Archive(path, _) => path,
}
}
}
/// Read an existing HTTP-cached [`Manifest`], if it exists.
pub(crate) fn read_http_manifest(cache_entry: &CacheEntry) -> Result<Option<Manifest>, Error> {
match fs_err::File::open(cache_entry.path()) {

View file

@ -3283,7 +3283,6 @@ fn missing_editable_requirement() -> Result<()> {
----- stderr -----
error: Failed to build editables
Caused by: Failed to build editable: file://[TEMP_DIR]/django-3.2.8.tar.gz
Caused by: Source distribution not found at: /[TEMP_DIR]/django-3.2.8.tar.gz
"###);