Move archive extraction into its own crate (#647)

We have some shared utilities between `puffin-build` and
`puffin-distribution`, and further, I want to be able to use the sdist
archive extraction logic (which currently lives in `puffin-build`) from
`puffin-distribution`. This logic is really generic, so it's moving into
its own crate.
Charlie Marsh, 2023-12-13 23:49:09 -05:00 (committed by GitHub)
parent 388641643d
commit db7e2dedbb
14 changed files with 174 additions and 134 deletions
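For context, a minimal usage sketch of the two entry points the new crate exposes (their signatures appear in the `puffin-extract` diff below). The file names and target paths here are hypothetical, and the wheel case relies on the vendored `HasLength` impl for `fs_err::File` that the crate uses internally:

```rust
use std::path::Path;

fn demo() -> Result<(), puffin_extract::Error> {
    // Extract an sdist; `extract_archive` dispatches on the file extension
    // (`.zip` vs `.tar.gz`). The paths are hypothetical.
    puffin_extract::extract_archive("foo-1.0.tar.gz", Path::new("/tmp/foo-sdist"))?;

    // Unzip a wheel from any `Read + Seek` source with a known length,
    // e.g. an open file (io errors convert into `puffin_extract::Error`).
    let wheel = fs_err::File::open("foo-1.0-py3-none-any.whl")?;
    puffin_extract::unzip_archive(wheel, Path::new("/tmp/foo-wheel"))?;

    Ok(())
}
```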

Cargo.lock (generated): 22 lines changed

@@ -2243,7 +2243,6 @@ name = "puffin-build"
version = "0.0.1"
dependencies = [
"anyhow",
"flate2",
"fs-err",
"gourgeist",
"indoc",
@@ -2252,6 +2251,7 @@ dependencies = [
"once_cell",
"pep508_rs",
"platform-host",
"puffin-extract",
"puffin-fs",
"puffin-interpreter",
"puffin-traits",
@@ -2260,14 +2260,11 @@ dependencies = [
"regex",
"serde",
"serde_json",
"tar",
"tempfile",
"thiserror",
"tokio",
"toml 0.8.8",
"tracing",
"which",
"zip",
]
[[package]]
@@ -2463,17 +2460,16 @@ dependencies = [
"platform-tags",
"puffin-cache",
"puffin-client",
"puffin-extract",
"puffin-fs",
"puffin-git",
"puffin-normalize",
"puffin-traits",
"pypi-types",
"rayon",
"reqwest",
"rustc-hash",
"serde",
"serde_json",
"sha2",
"tempfile",
"thiserror",
"tokio",
@@ -2483,6 +2479,19 @@ dependencies = [
"zip",
]
[[package]]
name = "puffin-extract"
version = "0.0.1"
dependencies = [
"flate2",
"fs-err",
"rayon",
"tar",
"thiserror",
"tokio",
"zip",
]
[[package]]
name = "puffin-fs"
version = "0.0.1"
@@ -2527,6 +2536,7 @@ dependencies = [
"puffin-cache",
"puffin-client",
"puffin-distribution",
"puffin-extract",
"puffin-fs",
"puffin-git",
"puffin-interpreter",


@@ -68,6 +68,10 @@ Implements the traits defined in `puffin-traits`.
Client for interacting with built distributions (wheels) and source distributions (sdists).
Capable of fetching metadata, distribution contents, etc.
## [puffin-extract](./puffin-extract)
Utilities for extracting files from archives.
## [puffin-fs](./puffin-fs)
Utilities for interacting with the filesystem.


@@ -17,13 +17,13 @@ workspace = true
gourgeist = { path = "../gourgeist" }
pep508_rs = { path = "../pep508-rs" }
platform-host = { path = "../platform-host" }
puffin-extract = { path = "../puffin-extract" }
puffin-fs = { path = "../puffin-fs" }
puffin-interpreter = { path = "../puffin-interpreter" }
puffin-traits = { path = "../puffin-traits" }
pypi-types = { path = "../pypi-types" }
anyhow = { workspace = true }
flate2 = { workspace = true }
fs-err = { workspace = true }
indoc = { workspace = true }
itertools = { workspace = true }
@@ -32,14 +32,11 @@ pyproject-toml = { workspace = true }
regex = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tar = { workspace = true }
tempfile = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true, features = ["sync", "process"] }
toml = { workspace = true }
tracing = { workspace = true }
which = { workspace = true}
zip = { workspace = true }
[dev-dependencies]
insta = { version = "1.34.0" }


@@ -13,22 +13,19 @@ use std::process::Output;
use std::str::FromStr;
use std::sync::Arc;
use flate2::read::GzDecoder;
use fs_err as fs;
use fs_err::{DirEntry, File};
use fs_err::DirEntry;
use indoc::formatdoc;
use itertools::Itertools;
use once_cell::sync::Lazy;
use pyproject_toml::{BuildSystem, Project};
use regex::Regex;
use serde::{Deserialize, Serialize};
use tar::Archive;
use tempfile::{tempdir, tempdir_in, TempDir};
use thiserror::Error;
use tokio::process::Command;
use tokio::sync::Mutex;
use tracing::{debug, info_span, instrument};
use zip::ZipArchive;
use pep508_rs::Requirement;
use puffin_interpreter::{Interpreter, Virtualenv};
@@ -46,8 +43,8 @@ static MISSING_HEADER_RE: Lazy<Regex> = Lazy::new(|| {
pub enum Error {
#[error(transparent)]
IO(#[from] io::Error),
#[error("Failed to read zip file")]
Zip(#[from] zip::result::ZipError),
#[error("Failed to extract archive: {0}")]
Extraction(PathBuf, #[source] puffin_extract::Error),
#[error("Unsupported archive format (extension not recognized): {0}")]
UnsupportedArchiveType(String),
#[error("Invalid source distribution: {0}")]
@@ -671,40 +668,12 @@ async fn create_pep517_build_environment(
/// Returns the directory with the `pyproject.toml`/`setup.py`
#[instrument(skip_all, fields(sdist = ? sdist.file_name().unwrap_or(sdist.as_os_str())))]
fn extract_archive(sdist: &Path, extracted: &PathBuf) -> Result<PathBuf, Error> {
if sdist
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("zip"))
{
// .zip
let mut archive = ZipArchive::new(File::open(sdist)?)?;
archive.extract(extracted)?;
} else if sdist
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("gz"))
&& sdist.file_stem().is_some_and(|stem| {
Path::new(stem)
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("tar"))
})
{
// .tar.gz
let mut archive = Archive::new(GzDecoder::new(File::open(sdist)?));
// https://github.com/alexcrichton/tar-rs/issues/349
archive.set_preserve_mtime(false);
archive.unpack(extracted)?;
} else {
return Err(Error::UnsupportedArchiveType(
sdist
.file_name()
.unwrap_or(sdist.as_os_str())
.to_string_lossy()
.to_string(),
));
}
puffin_extract::extract_archive(sdist, extracted)
.map_err(|err| Error::Extraction(sdist.to_path_buf(), err))?;
// > A .tar.gz source distribution (sdist) contains a single top-level directory called
// > `{name}-{version}` (e.g. foo-1.0), containing the source files of the package.
// TODO(konstin): Verify the name of the directory
// TODO(konstin): Verify the name of the directory.
let top_level = fs::read_dir(extracted)?.collect::<io::Result<Vec<DirEntry>>>()?;
let [root] = top_level.as_slice() else {
return Err(Error::InvalidSourceDist(format!(


@@ -20,6 +20,7 @@ pep440_rs = { path = "../pep440-rs" }
platform-tags = { path = "../platform-tags" }
puffin-cache = { path = "../puffin-cache" }
puffin-client = { path = "../puffin-client" }
puffin-extract = { path = "../puffin-extract" }
puffin-fs = { path = "../puffin-fs" }
puffin-git = { path = "../puffin-git" }
puffin-normalize = { path = "../puffin-normalize" }
@@ -31,12 +32,10 @@ bytesize = { workspace = true }
fs-err = { workspace = true }
fs2 = { workspace = true }
futures = { workspace = true }
rayon = { workspace = true }
reqwest = { workspace = true }
rustc-hash = { workspace = true }
serde = { workspace = true , features = ["derive"] }
serde_json = { workspace = true }
sha2 = { workspace = true }
tempfile = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }


@@ -3,7 +3,7 @@ pub use download::{DiskWheel, InMemoryWheel, LocalWheel};
pub use index::{BuiltWheelIndex, RegistryWheelIndex};
pub use reporter::Reporter;
pub use source_dist::{SourceDistCachedBuilder, SourceDistError};
pub use unzip::{Unzip, UnzipError};
pub use unzip::Unzip;
mod distribution_database;
mod download;
@@ -13,4 +13,3 @@ mod locks;
mod reporter;
mod source_dist;
mod unzip;
mod vendor;


@@ -1,49 +1,35 @@
use std::io;
use std::io::{Read, Seek};
use std::path::Path;
use rayon::prelude::*;
use thiserror::Error;
use zip::result::ZipError;
use zip::ZipArchive;
use puffin_extract::{unzip_archive, Error};
use crate::download::BuiltWheel;
use crate::vendor::{CloneableSeekableReader, HasLength};
use crate::{DiskWheel, InMemoryWheel, LocalWheel};
#[derive(Debug, Error)]
pub enum UnzipError {
#[error(transparent)]
Zip(#[from] ZipError),
#[error(transparent)]
Io(#[from] io::Error),
}
pub trait Unzip {
/// Unzip a wheel into the target directory.
fn unzip(&self, target: &Path) -> Result<(), UnzipError>;
fn unzip(&self, target: &Path) -> Result<(), Error>;
}
impl Unzip for InMemoryWheel {
fn unzip(&self, target: &Path) -> Result<(), UnzipError> {
fn unzip(&self, target: &Path) -> Result<(), Error> {
unzip_archive(std::io::Cursor::new(&self.buffer), target)
}
}
impl Unzip for DiskWheel {
fn unzip(&self, target: &Path) -> Result<(), UnzipError> {
fn unzip(&self, target: &Path) -> Result<(), Error> {
unzip_archive(fs_err::File::open(&self.path)?, target)
}
}
impl Unzip for BuiltWheel {
fn unzip(&self, target: &Path) -> Result<(), UnzipError> {
fn unzip(&self, target: &Path) -> Result<(), Error> {
unzip_archive(fs_err::File::open(&self.path)?, target)
}
}
impl Unzip for LocalWheel {
fn unzip(&self, target: &Path) -> Result<(), UnzipError> {
fn unzip(&self, target: &Path) -> Result<(), Error> {
match self {
LocalWheel::InMemory(wheel) => wheel.unzip(target),
LocalWheel::Disk(wheel) => wheel.unzip(target),
@@ -51,52 +37,3 @@ impl Unzip for LocalWheel {
}
}
}
/// Unzip a zip archive into the target directory.
fn unzip_archive<R: Send + Read + Seek + HasLength>(
reader: R,
target: &Path,
) -> Result<(), UnzipError> {
// Unzip in parallel.
let archive = ZipArchive::new(CloneableSeekableReader::new(reader))?;
(0..archive.len())
.par_bridge()
.map(|file_number| {
let mut archive = archive.clone();
let mut file = archive.by_index(file_number)?;
// Determine the path of the file within the wheel.
let file_path = match file.enclosed_name() {
Some(path) => path.to_owned(),
None => return Ok(()),
};
// Create necessary parent directories.
let path = target.join(file_path);
if file.is_dir() {
fs_err::create_dir_all(path)?;
return Ok(());
}
if let Some(parent) = path.parent() {
fs_err::create_dir_all(parent)?;
}
// Write the file.
let mut outfile = fs_err::File::create(&path)?;
std::io::copy(&mut file, &mut outfile)?;
// Set permissions.
#[cfg(unix)]
{
use std::fs::Permissions;
use std::os::unix::fs::PermissionsExt;
if let Some(mode) = file.unix_mode() {
std::fs::set_permissions(&path, Permissions::from_mode(mode))?;
}
}
Ok(())
})
.collect::<Result<_, UnzipError>>()
}
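For illustration, a hedged sketch of how the slimmed-down `Unzip` trait is consumed after this change; the helper below is hypothetical, but the trait signature and re-exports match the diff above:

```rust
use std::path::Path;

use puffin_distribution::{LocalWheel, Unzip};

/// Hypothetical helper: unzip any downloaded wheel into a target directory.
/// Errors now surface as `puffin_extract::Error` rather than the removed
/// `UnzipError`.
fn unpack(wheel: &LocalWheel, target: &Path) -> Result<(), puffin_extract::Error> {
    wheel.unzip(target)
}
```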


@@ -1,3 +0,0 @@
pub(crate) use cloneable_seekable_reader::{CloneableSeekableReader, HasLength};
mod cloneable_seekable_reader;


@@ -0,0 +1,22 @@
[package]
name = "puffin-extract"
version = "0.0.1"
edition = { workspace = true }
rust-version = { workspace = true }
homepage = { workspace = true }
documentation = { workspace = true }
repository = { workspace = true }
authors = { workspace = true }
license = { workspace = true }
[lints]
workspace = true
[dependencies]
flate2 = { workspace = true }
fs-err = { workspace = true }
rayon = { workspace = true }
tar = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
zip = { workspace = true }


@@ -0,0 +1,104 @@
use std::path::{Path, PathBuf};
use rayon::prelude::*;
use zip::result::ZipError;
use zip::ZipArchive;
pub use crate::vendor::{CloneableSeekableReader, HasLength};
mod vendor;
#[derive(Debug, thiserror::Error)]
pub enum Error {
#[error(transparent)]
Zip(#[from] ZipError),
#[error(transparent)]
Io(#[from] std::io::Error),
#[error("Unsupported archive type: {0}")]
UnsupportedArchive(PathBuf),
}
/// Unzip a zip archive into the target directory.
pub fn unzip_archive<R: Send + std::io::Read + std::io::Seek + HasLength>(
reader: R,
target: &Path,
) -> Result<(), Error> {
// Unzip in parallel.
let archive = ZipArchive::new(CloneableSeekableReader::new(reader))?;
(0..archive.len())
.par_bridge()
.map(|file_number| {
let mut archive = archive.clone();
let mut file = archive.by_index(file_number)?;
// Determine the path of the file within the wheel.
let file_path = match file.enclosed_name() {
Some(path) => path.to_owned(),
None => return Ok(()),
};
// Create necessary parent directories.
let path = target.join(file_path);
if file.is_dir() {
fs_err::create_dir_all(path)?;
return Ok(());
}
if let Some(parent) = path.parent() {
fs_err::create_dir_all(parent)?;
}
// Write the file.
let mut outfile = fs_err::File::create(&path)?;
std::io::copy(&mut file, &mut outfile)?;
// Set permissions.
#[cfg(unix)]
{
use std::fs::Permissions;
use std::os::unix::fs::PermissionsExt;
if let Some(mode) = file.unix_mode() {
std::fs::set_permissions(&path, Permissions::from_mode(mode))?;
}
}
Ok(())
})
.collect::<Result<_, Error>>()
}
/// Extract a `.zip` or `.tar.gz` archive into the target directory.
pub fn extract_archive(source: impl AsRef<Path>, target: impl AsRef<Path>) -> Result<(), Error> {
// .zip
if source
.as_ref()
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("zip"))
{
unzip_archive(fs_err::File::open(source.as_ref())?, target.as_ref())?;
return Ok(());
}
// .tar.gz
if source
.as_ref()
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("gz"))
{
if source.as_ref().file_stem().is_some_and(|stem| {
Path::new(stem)
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("tar"))
}) {
let mut archive = tar::Archive::new(flate2::read::GzDecoder::new(fs_err::File::open(
source.as_ref(),
)?));
// https://github.com/alexcrichton/tar-rs/issues/349
archive.set_preserve_mtime(false);
archive.unpack(target)?;
return Ok(());
}
}
Err(Error::UnsupportedArchive(source.as_ref().to_path_buf()))
}
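As the final fallthrough above shows, dispatch is purely extension-based. A small sketch of the resulting behavior for an unsupported format (the filename is hypothetical; no filesystem access happens before the error is returned):

```rust
use std::path::Path;

fn rejects_unknown_formats() {
    // `.tar.bz2` matches neither the `.zip` branch nor the `.tar.gz` branch,
    // so `extract_archive` falls through to `Error::UnsupportedArchive`.
    let result = puffin_extract::extract_archive("pkg-1.0.tar.bz2", Path::new("/tmp/out"));
    assert!(matches!(
        result,
        Err(puffin_extract::Error::UnsupportedArchive(_))
    ));
}
```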


@@ -16,7 +16,8 @@ use std::{
/// A trait to represent some reader which has a total length known in
/// advance. This is roughly equivalent to the nightly
/// [`Seek::stream_len`] API.
pub(crate) trait HasLength {
#[allow(clippy::len_without_is_empty)]
pub trait HasLength {
/// Return the current total length of this stream.
fn len(&self) -> u64;
}
@@ -25,7 +26,7 @@ pub(crate) trait HasLength {
/// and thus can be cloned cheaply. It supports seeking; each cloned instance
/// maintains its own pointer into the file, and the underlying instance
/// is seeked prior to each read.
pub(crate) struct CloneableSeekableReader<R: Read + Seek + HasLength> {
pub struct CloneableSeekableReader<R: Read + Seek + HasLength> {
file: Arc<Mutex<R>>,
pos: u64,
// TODO determine and store this once instead of per cloneable file
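Since `HasLength` and `CloneableSeekableReader` are now public, downstream code can feed its own readers to `unzip_archive`; the crate already covers the readers it needs (e.g. `fs_err::File` and in-memory cursors, per the `Unzip` impls above). A minimal sketch, assuming a hypothetical in-memory wrapper type:

```rust
use std::io::{Cursor, Read, Seek, SeekFrom};
use std::path::Path;

use puffin_extract::{unzip_archive, Error, HasLength};

// Hypothetical wrapper over downloaded wheel bytes. Implementing `HasLength`
// (on top of `Read + Seek`) is all that is needed for `unzip_archive`, which
// wraps the reader in a `CloneableSeekableReader` so each parallel worker
// keeps its own cursor into the same underlying bytes.
struct WheelBytes(Cursor<Vec<u8>>);

impl Read for WheelBytes {
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        self.0.read(buf)
    }
}

impl Seek for WheelBytes {
    fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
        self.0.seek(pos)
    }
}

impl HasLength for WheelBytes {
    fn len(&self) -> u64 {
        self.0.get_ref().len() as u64
    }
}

fn unzip_bytes(bytes: Vec<u8>, target: &Path) -> Result<(), Error> {
    unzip_archive(WheelBytes(Cursor::new(bytes)), target)
}
```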


@@ -0,0 +1,3 @@
pub use cloneable_seekable_reader::{CloneableSeekableReader, HasLength};
mod cloneable_seekable_reader;


@@ -14,14 +14,15 @@ workspace = true
[dependencies]
distribution-filename = { path = "../distribution-filename" }
distribution-types = { path = "../distribution-types" }
install-wheel-rs = { path = "../install-wheel-rs", default-features = false }
pep440_rs = { path = "../pep440-rs" }
pep508_rs = { path = "../pep508-rs" }
platform-tags = { path = "../platform-tags" }
puffin-cache = { path = "../puffin-cache" }
puffin-client = { path = "../puffin-client" }
distribution-types = { path = "../distribution-types" }
platform-tags = { path = "../platform-tags" }
puffin-distribution = { path = "../puffin-distribution" }
puffin-extract = { path = "../puffin-extract" }
puffin-fs = { path = "../puffin-fs" }
puffin-git = { path = "../puffin-git" }
puffin-interpreter = { path = "../puffin-interpreter" }


@@ -1,6 +1,5 @@
use std::cmp::Reverse;
use std::path::PathBuf;
use std::sync::Arc;
use futures::{StreamExt, TryFutureExt};
@@ -12,18 +11,16 @@ use distribution_types::{CachedDist, Dist, RemoteSource, SourceDist};
use platform_tags::Tags;
use puffin_cache::Cache;
use puffin_client::RegistryClient;
use puffin_distribution::{
DistributionDatabase, DistributionDatabaseError, LocalWheel, Unzip, UnzipError,
};
use puffin_distribution::{DistributionDatabase, DistributionDatabaseError, LocalWheel, Unzip};
use puffin_traits::{BuildContext, OnceMap};
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("Failed to unzip wheel: {0}")]
Unzip(Dist, #[source] UnzipError),
Unzip(Dist, #[source] puffin_extract::Error),
#[error("Failed to fetch wheel: {0}")]
Fetch(Dist, #[source] DistributionDatabaseError),
/// Should not occur, i've only seen it when another task panicked
/// Should not occur; only seen when another task panicked.
#[error("The task executor is broken, did some other task panic?")]
Join(#[from] JoinError),
#[error("Unzip failed in another thread: {0}")]
@@ -160,7 +157,7 @@ impl<'a, Context: BuildContext + Send + Sync> Downloader<'a, Context> {
// Unzip the wheel.
let normalized_path = tokio::task::spawn_blocking({
move || -> Result<PathBuf, UnzipError> {
move || -> Result<PathBuf, puffin_extract::Error> {
// Unzip the wheel into a temporary directory.
let parent = download
.target()