Move archive extraction into its own crate (#647)

We have some utilities shared between `puffin-build` and
`puffin-distribution`, and further, I want to be able to access the
sdist archive extraction logic from `puffin-distribution`. This logic is
really generic, so I'm moving it into its own crate.
This commit is contained in:
Charlie Marsh 2023-12-13 23:49:09 -05:00 committed by GitHub
parent 388641643d
commit db7e2dedbb
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 174 additions and 134 deletions

22
Cargo.lock generated
View file

@ -2243,7 +2243,6 @@ name = "puffin-build"
version = "0.0.1" version = "0.0.1"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"flate2",
"fs-err", "fs-err",
"gourgeist", "gourgeist",
"indoc", "indoc",
@ -2252,6 +2251,7 @@ dependencies = [
"once_cell", "once_cell",
"pep508_rs", "pep508_rs",
"platform-host", "platform-host",
"puffin-extract",
"puffin-fs", "puffin-fs",
"puffin-interpreter", "puffin-interpreter",
"puffin-traits", "puffin-traits",
@ -2260,14 +2260,11 @@ dependencies = [
"regex", "regex",
"serde", "serde",
"serde_json", "serde_json",
"tar",
"tempfile", "tempfile",
"thiserror", "thiserror",
"tokio", "tokio",
"toml 0.8.8", "toml 0.8.8",
"tracing", "tracing",
"which",
"zip",
] ]
[[package]] [[package]]
@ -2463,17 +2460,16 @@ dependencies = [
"platform-tags", "platform-tags",
"puffin-cache", "puffin-cache",
"puffin-client", "puffin-client",
"puffin-extract",
"puffin-fs", "puffin-fs",
"puffin-git", "puffin-git",
"puffin-normalize", "puffin-normalize",
"puffin-traits", "puffin-traits",
"pypi-types", "pypi-types",
"rayon",
"reqwest", "reqwest",
"rustc-hash", "rustc-hash",
"serde", "serde",
"serde_json", "serde_json",
"sha2",
"tempfile", "tempfile",
"thiserror", "thiserror",
"tokio", "tokio",
@ -2483,6 +2479,19 @@ dependencies = [
"zip", "zip",
] ]
[[package]]
name = "puffin-extract"
version = "0.0.1"
dependencies = [
"flate2",
"fs-err",
"rayon",
"tar",
"thiserror",
"tokio",
"zip",
]
[[package]] [[package]]
name = "puffin-fs" name = "puffin-fs"
version = "0.0.1" version = "0.0.1"
@ -2527,6 +2536,7 @@ dependencies = [
"puffin-cache", "puffin-cache",
"puffin-client", "puffin-client",
"puffin-distribution", "puffin-distribution",
"puffin-extract",
"puffin-fs", "puffin-fs",
"puffin-git", "puffin-git",
"puffin-interpreter", "puffin-interpreter",

View file

@ -68,6 +68,10 @@ Implements the traits defined in `puffin-traits`.
Client for interacting with built distributions (wheels) and source distributions (sdists). Client for interacting with built distributions (wheels) and source distributions (sdists).
Capable of fetching metadata, distribution contents, etc. Capable of fetching metadata, distribution contents, etc.
## [puffin-extract](./puffin-extract)
Utilities for extracting files from archives.
## [puffin-fs](./puffin-fs) ## [puffin-fs](./puffin-fs)
Utilities for interacting with the filesystem. Utilities for interacting with the filesystem.

View file

@ -17,13 +17,13 @@ workspace = true
gourgeist = { path = "../gourgeist" } gourgeist = { path = "../gourgeist" }
pep508_rs = { path = "../pep508-rs" } pep508_rs = { path = "../pep508-rs" }
platform-host = { path = "../platform-host" } platform-host = { path = "../platform-host" }
puffin-extract = { path = "../puffin-extract" }
puffin-fs = { path = "../puffin-fs" } puffin-fs = { path = "../puffin-fs" }
puffin-interpreter = { path = "../puffin-interpreter" } puffin-interpreter = { path = "../puffin-interpreter" }
puffin-traits = { path = "../puffin-traits" } puffin-traits = { path = "../puffin-traits" }
pypi-types = { path = "../pypi-types" } pypi-types = { path = "../pypi-types" }
anyhow = { workspace = true } anyhow = { workspace = true }
flate2 = { workspace = true }
fs-err = { workspace = true } fs-err = { workspace = true }
indoc = { workspace = true } indoc = { workspace = true }
itertools = { workspace = true } itertools = { workspace = true }
@ -32,14 +32,11 @@ pyproject-toml = { workspace = true }
regex = { workspace = true } regex = { workspace = true }
serde = { workspace = true } serde = { workspace = true }
serde_json = { workspace = true } serde_json = { workspace = true }
tar = { workspace = true }
tempfile = { workspace = true } tempfile = { workspace = true }
thiserror = { workspace = true } thiserror = { workspace = true }
tokio = { workspace = true, features = ["sync", "process"] } tokio = { workspace = true, features = ["sync", "process"] }
toml = { workspace = true } toml = { workspace = true }
tracing = { workspace = true } tracing = { workspace = true }
which = { workspace = true}
zip = { workspace = true }
[dev-dependencies] [dev-dependencies]
insta = { version = "1.34.0" } insta = { version = "1.34.0" }

View file

@ -13,22 +13,19 @@ use std::process::Output;
use std::str::FromStr; use std::str::FromStr;
use std::sync::Arc; use std::sync::Arc;
use flate2::read::GzDecoder;
use fs_err as fs; use fs_err as fs;
use fs_err::{DirEntry, File}; use fs_err::DirEntry;
use indoc::formatdoc; use indoc::formatdoc;
use itertools::Itertools; use itertools::Itertools;
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use pyproject_toml::{BuildSystem, Project}; use pyproject_toml::{BuildSystem, Project};
use regex::Regex; use regex::Regex;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use tar::Archive;
use tempfile::{tempdir, tempdir_in, TempDir}; use tempfile::{tempdir, tempdir_in, TempDir};
use thiserror::Error; use thiserror::Error;
use tokio::process::Command; use tokio::process::Command;
use tokio::sync::Mutex; use tokio::sync::Mutex;
use tracing::{debug, info_span, instrument}; use tracing::{debug, info_span, instrument};
use zip::ZipArchive;
use pep508_rs::Requirement; use pep508_rs::Requirement;
use puffin_interpreter::{Interpreter, Virtualenv}; use puffin_interpreter::{Interpreter, Virtualenv};
@ -46,8 +43,8 @@ static MISSING_HEADER_RE: Lazy<Regex> = Lazy::new(|| {
pub enum Error { pub enum Error {
#[error(transparent)] #[error(transparent)]
IO(#[from] io::Error), IO(#[from] io::Error),
#[error("Failed to read zip file")] #[error("Failed to extract archive: {0}")]
Zip(#[from] zip::result::ZipError), Extraction(PathBuf, #[source] puffin_extract::Error),
#[error("Unsupported archive format (extension not recognized): {0}")] #[error("Unsupported archive format (extension not recognized): {0}")]
UnsupportedArchiveType(String), UnsupportedArchiveType(String),
#[error("Invalid source distribution: {0}")] #[error("Invalid source distribution: {0}")]
@ -671,40 +668,12 @@ async fn create_pep517_build_environment(
/// Returns the directory with the `pyproject.toml`/`setup.py` /// Returns the directory with the `pyproject.toml`/`setup.py`
#[instrument(skip_all, fields(sdist = ? sdist.file_name().unwrap_or(sdist.as_os_str())))] #[instrument(skip_all, fields(sdist = ? sdist.file_name().unwrap_or(sdist.as_os_str())))]
fn extract_archive(sdist: &Path, extracted: &PathBuf) -> Result<PathBuf, Error> { fn extract_archive(sdist: &Path, extracted: &PathBuf) -> Result<PathBuf, Error> {
if sdist puffin_extract::extract_archive(sdist, extracted)
.extension() .map_err(|err| Error::Extraction(sdist.to_path_buf(), err))?;
.is_some_and(|ext| ext.eq_ignore_ascii_case("zip"))
{
// .zip
let mut archive = ZipArchive::new(File::open(sdist)?)?;
archive.extract(extracted)?;
} else if sdist
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("gz"))
&& sdist.file_stem().is_some_and(|stem| {
Path::new(stem)
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("tar"))
})
{
// .tar.gz
let mut archive = Archive::new(GzDecoder::new(File::open(sdist)?));
// https://github.com/alexcrichton/tar-rs/issues/349
archive.set_preserve_mtime(false);
archive.unpack(extracted)?;
} else {
return Err(Error::UnsupportedArchiveType(
sdist
.file_name()
.unwrap_or(sdist.as_os_str())
.to_string_lossy()
.to_string(),
));
}
// > A .tar.gz source distribution (sdist) contains a single top-level directory called // > A .tar.gz source distribution (sdist) contains a single top-level directory called
// > `{name}-{version}` (e.g. foo-1.0), containing the source files of the package. // > `{name}-{version}` (e.g. foo-1.0), containing the source files of the package.
// TODO(konstin): Verify the name of the directory // TODO(konstin): Verify the name of the directory.
let top_level = fs::read_dir(extracted)?.collect::<io::Result<Vec<DirEntry>>>()?; let top_level = fs::read_dir(extracted)?.collect::<io::Result<Vec<DirEntry>>>()?;
let [root] = top_level.as_slice() else { let [root] = top_level.as_slice() else {
return Err(Error::InvalidSourceDist(format!( return Err(Error::InvalidSourceDist(format!(

View file

@ -20,6 +20,7 @@ pep440_rs = { path = "../pep440-rs" }
platform-tags = { path = "../platform-tags" } platform-tags = { path = "../platform-tags" }
puffin-cache = { path = "../puffin-cache" } puffin-cache = { path = "../puffin-cache" }
puffin-client = { path = "../puffin-client" } puffin-client = { path = "../puffin-client" }
puffin-extract = { path = "../puffin-extract" }
puffin-fs = { path = "../puffin-fs" } puffin-fs = { path = "../puffin-fs" }
puffin-git = { path = "../puffin-git" } puffin-git = { path = "../puffin-git" }
puffin-normalize = { path = "../puffin-normalize" } puffin-normalize = { path = "../puffin-normalize" }
@ -31,12 +32,10 @@ bytesize = { workspace = true }
fs-err = { workspace = true } fs-err = { workspace = true }
fs2 = { workspace = true } fs2 = { workspace = true }
futures = { workspace = true } futures = { workspace = true }
rayon = { workspace = true }
reqwest = { workspace = true } reqwest = { workspace = true }
rustc-hash = { workspace = true } rustc-hash = { workspace = true }
serde = { workspace = true , features = ["derive"] } serde = { workspace = true , features = ["derive"] }
serde_json = { workspace = true } serde_json = { workspace = true }
sha2 = { workspace = true }
tempfile = { workspace = true } tempfile = { workspace = true }
thiserror = { workspace = true } thiserror = { workspace = true }
tokio = { workspace = true } tokio = { workspace = true }

View file

@ -3,7 +3,7 @@ pub use download::{DiskWheel, InMemoryWheel, LocalWheel};
pub use index::{BuiltWheelIndex, RegistryWheelIndex}; pub use index::{BuiltWheelIndex, RegistryWheelIndex};
pub use reporter::Reporter; pub use reporter::Reporter;
pub use source_dist::{SourceDistCachedBuilder, SourceDistError}; pub use source_dist::{SourceDistCachedBuilder, SourceDistError};
pub use unzip::{Unzip, UnzipError}; pub use unzip::Unzip;
mod distribution_database; mod distribution_database;
mod download; mod download;
@ -13,4 +13,3 @@ mod locks;
mod reporter; mod reporter;
mod source_dist; mod source_dist;
mod unzip; mod unzip;
mod vendor;

View file

@ -1,49 +1,35 @@
use std::io;
use std::io::{Read, Seek};
use std::path::Path; use std::path::Path;
use rayon::prelude::*; use puffin_extract::{unzip_archive, Error};
use thiserror::Error;
use zip::result::ZipError;
use zip::ZipArchive;
use crate::download::BuiltWheel; use crate::download::BuiltWheel;
use crate::vendor::{CloneableSeekableReader, HasLength};
use crate::{DiskWheel, InMemoryWheel, LocalWheel}; use crate::{DiskWheel, InMemoryWheel, LocalWheel};
#[derive(Debug, Error)]
pub enum UnzipError {
#[error(transparent)]
Zip(#[from] ZipError),
#[error(transparent)]
Io(#[from] io::Error),
}
pub trait Unzip { pub trait Unzip {
/// Unzip a wheel into the target directory. /// Unzip a wheel into the target directory.
fn unzip(&self, target: &Path) -> Result<(), UnzipError>; fn unzip(&self, target: &Path) -> Result<(), Error>;
} }
impl Unzip for InMemoryWheel { impl Unzip for InMemoryWheel {
fn unzip(&self, target: &Path) -> Result<(), UnzipError> { fn unzip(&self, target: &Path) -> Result<(), Error> {
unzip_archive(std::io::Cursor::new(&self.buffer), target) unzip_archive(std::io::Cursor::new(&self.buffer), target)
} }
} }
impl Unzip for DiskWheel { impl Unzip for DiskWheel {
fn unzip(&self, target: &Path) -> Result<(), UnzipError> { fn unzip(&self, target: &Path) -> Result<(), Error> {
unzip_archive(fs_err::File::open(&self.path)?, target) unzip_archive(fs_err::File::open(&self.path)?, target)
} }
} }
impl Unzip for BuiltWheel { impl Unzip for BuiltWheel {
fn unzip(&self, target: &Path) -> Result<(), UnzipError> { fn unzip(&self, target: &Path) -> Result<(), Error> {
unzip_archive(fs_err::File::open(&self.path)?, target) unzip_archive(fs_err::File::open(&self.path)?, target)
} }
} }
impl Unzip for LocalWheel { impl Unzip for LocalWheel {
fn unzip(&self, target: &Path) -> Result<(), UnzipError> { fn unzip(&self, target: &Path) -> Result<(), Error> {
match self { match self {
LocalWheel::InMemory(wheel) => wheel.unzip(target), LocalWheel::InMemory(wheel) => wheel.unzip(target),
LocalWheel::Disk(wheel) => wheel.unzip(target), LocalWheel::Disk(wheel) => wheel.unzip(target),
@ -51,52 +37,3 @@ impl Unzip for LocalWheel {
} }
} }
} }
/// Unzip a zip archive into the target directory.
fn unzip_archive<R: Send + Read + Seek + HasLength>(
reader: R,
target: &Path,
) -> Result<(), UnzipError> {
// Unzip in parallel.
let archive = ZipArchive::new(CloneableSeekableReader::new(reader))?;
(0..archive.len())
.par_bridge()
.map(|file_number| {
let mut archive = archive.clone();
let mut file = archive.by_index(file_number)?;
// Determine the path of the file within the wheel.
let file_path = match file.enclosed_name() {
Some(path) => path.to_owned(),
None => return Ok(()),
};
// Create necessary parent directories.
let path = target.join(file_path);
if file.is_dir() {
fs_err::create_dir_all(path)?;
return Ok(());
}
if let Some(parent) = path.parent() {
fs_err::create_dir_all(parent)?;
}
// Write the file.
let mut outfile = fs_err::File::create(&path)?;
std::io::copy(&mut file, &mut outfile)?;
// Set permissions.
#[cfg(unix)]
{
use std::fs::Permissions;
use std::os::unix::fs::PermissionsExt;
if let Some(mode) = file.unix_mode() {
std::fs::set_permissions(&path, Permissions::from_mode(mode))?;
}
}
Ok(())
})
.collect::<Result<_, UnzipError>>()
}

View file

@ -1,3 +0,0 @@
pub(crate) use cloneable_seekable_reader::{CloneableSeekableReader, HasLength};
mod cloneable_seekable_reader;

View file

@ -0,0 +1,22 @@
[package]
name = "puffin-extract"
version = "0.0.1"
edition = { workspace = true }
rust-version = { workspace = true }
homepage = { workspace = true }
documentation = { workspace = true }
repository = { workspace = true }
authors = { workspace = true }
license = { workspace = true }
[lints]
workspace = true
[dependencies]
flate2 = { workspace = true }
fs-err = { workspace = true }
rayon = { workspace = true }
tar = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
zip = { workspace = true }

View file

@ -0,0 +1,104 @@
use std::path::{Path, PathBuf};
use rayon::prelude::*;
use zip::result::ZipError;
use zip::ZipArchive;
pub use crate::vendor::{CloneableSeekableReader, HasLength};
mod vendor;
/// Errors that can occur while extracting `.zip` or `.tar.gz` archives.
#[derive(Debug, thiserror::Error)]
pub enum Error {
    /// An error surfaced by the underlying `zip` crate while reading a zip archive.
    #[error(transparent)]
    Zip(#[from] ZipError),
    /// An I/O failure while reading the archive or writing extracted files.
    #[error(transparent)]
    Io(#[from] std::io::Error),
    /// The source path's extension matched neither `.zip` nor `.tar.gz`.
    #[error("Unsupported archive type: {0}")]
    UnsupportedArchive(PathBuf),
}
/// Unzip a zip archive into the target directory.
///
/// Entries are extracted in parallel: directory entries are created directly,
/// file entries are written out (creating parent directories as needed), and —
/// on Unix — the mode bits recorded in the archive are applied to each written
/// file. Entries whose names would escape the target directory are skipped.
pub fn unzip_archive<R: Send + std::io::Read + std::io::Seek + HasLength>(
    reader: R,
    target: &Path,
) -> Result<(), Error> {
    // Open the archive once, then hand each parallel worker a cheap clone that
    // maintains its own read position over the shared reader.
    let archive = ZipArchive::new(CloneableSeekableReader::new(reader))?;
    (0..archive.len())
        .par_bridge()
        .map(|index| {
            let mut local = archive.clone();
            let mut entry = local.by_index(index)?;

            // `enclosed_name` rejects paths that would traverse outside the
            // target directory; silently skip such entries.
            let Some(relative) = entry.enclosed_name().map(|name| name.to_owned()) else {
                return Ok(());
            };
            let absolute = target.join(relative);

            // Directory entries carry no contents; just materialize them.
            if entry.is_dir() {
                fs_err::create_dir_all(absolute)?;
                return Ok(());
            }

            // Ensure the parent directory exists before writing the file.
            if let Some(parent) = absolute.parent() {
                fs_err::create_dir_all(parent)?;
            }
            let mut writer = fs_err::File::create(&absolute)?;
            std::io::copy(&mut entry, &mut writer)?;

            // Restore the archived permission bits, when present.
            #[cfg(unix)]
            {
                use std::fs::Permissions;
                use std::os::unix::fs::PermissionsExt;

                if let Some(mode) = entry.unix_mode() {
                    std::fs::set_permissions(&absolute, Permissions::from_mode(mode))?;
                }
            }

            Ok(())
        })
        .collect::<Result<_, Error>>()
}
/// Extract a `.zip` or `.tar.gz` archive into the target directory.
pub fn extract_archive(source: impl AsRef<Path>, target: impl AsRef<Path>) -> Result<(), Error> {
// .zip
if source
.as_ref()
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("zip"))
{
unzip_archive(fs_err::File::open(source.as_ref())?, target.as_ref())?;
return Ok(());
}
// .tar.gz
if source
.as_ref()
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("gz"))
{
if source.as_ref().file_stem().is_some_and(|stem| {
Path::new(stem)
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("tar"))
}) {
let mut archive = tar::Archive::new(flate2::read::GzDecoder::new(fs_err::File::open(
source.as_ref(),
)?));
// https://github.com/alexcrichton/tar-rs/issues/349
archive.set_preserve_mtime(false);
archive.unpack(target)?;
return Ok(());
}
}
Err(Error::UnsupportedArchive(source.as_ref().to_path_buf()))
}

View file

@ -16,7 +16,8 @@ use std::{
/// A trait to represent some reader which has a total length known in /// A trait to represent some reader which has a total length known in
/// advance. This is roughly equivalent to the nightly /// advance. This is roughly equivalent to the nightly
/// [`Seek::stream_len`] API. /// [`Seek::stream_len`] API.
pub(crate) trait HasLength { #[allow(clippy::len_without_is_empty)]
pub trait HasLength {
/// Return the current total length of this stream. /// Return the current total length of this stream.
fn len(&self) -> u64; fn len(&self) -> u64;
} }
@ -25,7 +26,7 @@ pub(crate) trait HasLength {
/// and thus can be cloned cheaply. It supports seeking; each cloned instance /// and thus can be cloned cheaply. It supports seeking; each cloned instance
/// maintains its own pointer into the file, and the underlying instance /// maintains its own pointer into the file, and the underlying instance
/// is seeked prior to each read. /// is seeked prior to each read.
pub(crate) struct CloneableSeekableReader<R: Read + Seek + HasLength> { pub struct CloneableSeekableReader<R: Read + Seek + HasLength> {
file: Arc<Mutex<R>>, file: Arc<Mutex<R>>,
pos: u64, pos: u64,
// TODO determine and store this once instead of per cloneable file // TODO determine and store this once instead of per cloneable file

View file

@ -0,0 +1,3 @@
pub use cloneable_seekable_reader::{CloneableSeekableReader, HasLength};
mod cloneable_seekable_reader;

View file

@ -14,14 +14,15 @@ workspace = true
[dependencies] [dependencies]
distribution-filename = { path = "../distribution-filename" } distribution-filename = { path = "../distribution-filename" }
distribution-types = { path = "../distribution-types" }
install-wheel-rs = { path = "../install-wheel-rs", default-features = false } install-wheel-rs = { path = "../install-wheel-rs", default-features = false }
pep440_rs = { path = "../pep440-rs" } pep440_rs = { path = "../pep440-rs" }
pep508_rs = { path = "../pep508-rs" } pep508_rs = { path = "../pep508-rs" }
platform-tags = { path = "../platform-tags" }
puffin-cache = { path = "../puffin-cache" } puffin-cache = { path = "../puffin-cache" }
puffin-client = { path = "../puffin-client" } puffin-client = { path = "../puffin-client" }
distribution-types = { path = "../distribution-types" }
platform-tags = { path = "../platform-tags" }
puffin-distribution = { path = "../puffin-distribution" } puffin-distribution = { path = "../puffin-distribution" }
puffin-extract = { path = "../puffin-extract" }
puffin-fs = { path = "../puffin-fs" } puffin-fs = { path = "../puffin-fs" }
puffin-git = { path = "../puffin-git" } puffin-git = { path = "../puffin-git" }
puffin-interpreter = { path = "../puffin-interpreter" } puffin-interpreter = { path = "../puffin-interpreter" }

View file

@ -1,6 +1,5 @@
use std::cmp::Reverse; use std::cmp::Reverse;
use std::path::PathBuf; use std::path::PathBuf;
use std::sync::Arc; use std::sync::Arc;
use futures::{StreamExt, TryFutureExt}; use futures::{StreamExt, TryFutureExt};
@ -12,18 +11,16 @@ use distribution_types::{CachedDist, Dist, RemoteSource, SourceDist};
use platform_tags::Tags; use platform_tags::Tags;
use puffin_cache::Cache; use puffin_cache::Cache;
use puffin_client::RegistryClient; use puffin_client::RegistryClient;
use puffin_distribution::{ use puffin_distribution::{DistributionDatabase, DistributionDatabaseError, LocalWheel, Unzip};
DistributionDatabase, DistributionDatabaseError, LocalWheel, Unzip, UnzipError,
};
use puffin_traits::{BuildContext, OnceMap}; use puffin_traits::{BuildContext, OnceMap};
#[derive(thiserror::Error, Debug)] #[derive(thiserror::Error, Debug)]
pub enum Error { pub enum Error {
#[error("Failed to unzip wheel: {0}")] #[error("Failed to unzip wheel: {0}")]
Unzip(Dist, #[source] UnzipError), Unzip(Dist, #[source] puffin_extract::Error),
#[error("Failed to fetch wheel: {0}")] #[error("Failed to fetch wheel: {0}")]
Fetch(Dist, #[source] DistributionDatabaseError), Fetch(Dist, #[source] DistributionDatabaseError),
/// Should not occur, i've only seen it when another task panicked /// Should not occur; only seen when another task panicked.
#[error("The task executor is broken, did some other task panic?")] #[error("The task executor is broken, did some other task panic?")]
Join(#[from] JoinError), Join(#[from] JoinError),
#[error("Unzip failed in another thread: {0}")] #[error("Unzip failed in another thread: {0}")]
@ -160,7 +157,7 @@ impl<'a, Context: BuildContext + Send + Sync> Downloader<'a, Context> {
// Unzip the wheel. // Unzip the wheel.
let normalized_path = tokio::task::spawn_blocking({ let normalized_path = tokio::task::spawn_blocking({
move || -> Result<PathBuf, UnzipError> { move || -> Result<PathBuf, puffin_extract::Error> {
// Unzip the wheel into a temporary directory. // Unzip the wheel into a temporary directory.
let parent = download let parent = download
.target() .target()