Move archive extraction into its own crate (#647)

We have some shared utilities that are needed beyond `puffin-build` and
`puffin-distribution`, and I also want to be able to access the sdist
archive extraction logic from `puffin-distribution`. That logic is fully
generic, so this commit moves it into its own crate.
This commit is contained in:
Charlie Marsh 2023-12-13 23:49:09 -05:00 committed by GitHub
parent 388641643d
commit db7e2dedbb
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 174 additions and 134 deletions

View file

@ -20,6 +20,7 @@ pep440_rs = { path = "../pep440-rs" }
platform-tags = { path = "../platform-tags" }
puffin-cache = { path = "../puffin-cache" }
puffin-client = { path = "../puffin-client" }
puffin-extract = { path = "../puffin-extract" }
puffin-fs = { path = "../puffin-fs" }
puffin-git = { path = "../puffin-git" }
puffin-normalize = { path = "../puffin-normalize" }
@ -31,12 +32,10 @@ bytesize = { workspace = true }
fs-err = { workspace = true }
fs2 = { workspace = true }
futures = { workspace = true }
rayon = { workspace = true }
reqwest = { workspace = true }
rustc-hash = { workspace = true }
serde = { workspace = true , features = ["derive"] }
serde_json = { workspace = true }
sha2 = { workspace = true }
tempfile = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }

View file

@ -3,7 +3,7 @@ pub use download::{DiskWheel, InMemoryWheel, LocalWheel};
pub use index::{BuiltWheelIndex, RegistryWheelIndex};
pub use reporter::Reporter;
pub use source_dist::{SourceDistCachedBuilder, SourceDistError};
pub use unzip::{Unzip, UnzipError};
pub use unzip::Unzip;
mod distribution_database;
mod download;
@ -13,4 +13,3 @@ mod locks;
mod reporter;
mod source_dist;
mod unzip;
mod vendor;

View file

@ -1,49 +1,35 @@
use std::io;
use std::io::{Read, Seek};
use std::path::Path;
use rayon::prelude::*;
use thiserror::Error;
use zip::result::ZipError;
use zip::ZipArchive;
use puffin_extract::{unzip_archive, Error};
use crate::download::BuiltWheel;
use crate::vendor::{CloneableSeekableReader, HasLength};
use crate::{DiskWheel, InMemoryWheel, LocalWheel};
/// Errors that can be returned while unzipping a wheel.
#[derive(Debug, Error)]
pub enum UnzipError {
/// An error from the `zip` crate, surfaced unchanged.
#[error(transparent)]
Zip(#[from] ZipError),
/// An I/O error, surfaced unchanged.
#[error(transparent)]
Io(#[from] io::Error),
}
/// Implemented by the wheel types in this crate that can be extracted to disk.
pub trait Unzip {
/// Unzip a wheel into the target directory.
fn unzip(&self, target: &Path) -> Result<(), UnzipError>;
fn unzip(&self, target: &Path) -> Result<(), Error>;
}
/// Extracts the wheel straight from its in-memory buffer, which is wrapped in a
/// cursor so it can be handed to `unzip_archive`.
impl Unzip for InMemoryWheel {
fn unzip(&self, target: &Path) -> Result<(), UnzipError> {
fn unzip(&self, target: &Path) -> Result<(), Error> {
unzip_archive(std::io::Cursor::new(&self.buffer), target)
}
}
/// Opens the wheel file at `self.path` and extracts it with `unzip_archive`.
/// A failure to open the file is propagated to the caller via `?`.
impl Unzip for DiskWheel {
fn unzip(&self, target: &Path) -> Result<(), UnzipError> {
fn unzip(&self, target: &Path) -> Result<(), Error> {
unzip_archive(fs_err::File::open(&self.path)?, target)
}
}
/// Opens the built wheel file at `self.path` and extracts it with `unzip_archive`.
/// A failure to open the file is propagated to the caller via `?`.
impl Unzip for BuiltWheel {
fn unzip(&self, target: &Path) -> Result<(), UnzipError> {
fn unzip(&self, target: &Path) -> Result<(), Error> {
unzip_archive(fs_err::File::open(&self.path)?, target)
}
}
impl Unzip for LocalWheel {
fn unzip(&self, target: &Path) -> Result<(), UnzipError> {
fn unzip(&self, target: &Path) -> Result<(), Error> {
match self {
LocalWheel::InMemory(wheel) => wheel.unzip(target),
LocalWheel::Disk(wheel) => wheel.unzip(target),
@ -51,52 +37,3 @@ impl Unzip for LocalWheel {
}
}
}
/// Extract every entry of a zip archive beneath `target`.
///
/// Entries are handed out to rayon workers by index. Each worker clones the
/// archive handle (the reader is wrapped in a [`CloneableSeekableReader`], so
/// clones share the underlying stream) and writes its entry to disk.
///
/// # Errors
///
/// Returns an [`UnzipError`] if the archive cannot be opened or an entry
/// cannot be read, or if creating a directory, writing a file, or (on Unix)
/// setting permissions fails.
fn unzip_archive<R: Send + Read + Seek + HasLength>(
    reader: R,
    target: &Path,
) -> Result<(), UnzipError> {
    let zip = ZipArchive::new(CloneableSeekableReader::new(reader))?;
    let entry_count = zip.len();

    (0..entry_count)
        .par_bridge()
        .map(|index| {
            // Every task works on its own clone of the archive handle.
            let mut handle = zip.clone();
            let mut entry = handle.by_index(index)?;

            // Entries for which `enclosed_name` yields nothing are skipped.
            let destination = match entry.enclosed_name() {
                None => return Ok(()),
                Some(relative) => target.join(relative),
            };

            if entry.is_dir() {
                // Directory entry: just materialize it.
                fs_err::create_dir_all(destination)?;
            } else {
                // File entry: make sure its parent exists, then copy the bytes.
                if let Some(directory) = destination.parent() {
                    fs_err::create_dir_all(directory)?;
                }
                let mut sink = fs_err::File::create(&destination)?;
                std::io::copy(&mut entry, &mut sink)?;

                // Carry over the Unix mode recorded in the archive, if any.
                #[cfg(unix)]
                {
                    use std::fs::Permissions;
                    use std::os::unix::fs::PermissionsExt;

                    if let Some(mode) = entry.unix_mode() {
                        std::fs::set_permissions(&destination, Permissions::from_mode(mode))?;
                    }
                }
            }

            Ok(())
        })
        .collect::<Result<_, UnzipError>>()
}

View file

@ -1,172 +0,0 @@
// Copyright 2022 Google LLC
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
#![allow(clippy::cast_sign_loss)]
use std::{
io::{BufReader, Cursor, Read, Seek, SeekFrom},
sync::{Arc, Mutex},
};
/// A trait to represent some reader which has a total length known in
/// advance. This is roughly equivalent to the nightly
/// [`Seek::stream_len`] API.
///
/// [`CloneableSeekableReader`] uses this length to resolve `SeekFrom::End`
/// without seeking the underlying stream.
pub(crate) trait HasLength {
/// Return the current total length of this stream.
fn len(&self) -> u64;
}
/// A [`Read`] which refers to its underlying stream by reference count,
/// and thus can be cloned cheaply. It supports seeking; each cloned instance
/// maintains its own pointer into the file, and the underlying instance
/// is seeked prior to each read.
pub(crate) struct CloneableSeekableReader<R: Read + Seek + HasLength> {
/// The underlying stream, shared behind a mutex by every clone of this reader.
file: Arc<Mutex<R>>,
/// This instance's own position, as an offset from the start of the stream.
pos: u64,
// TODO determine and store this once instead of per cloneable file
/// Cached total length of the stream; `None` until it has been computed
/// by `ascertain_file_length`.
file_length: Option<u64>,
}
impl<R: Read + Seek + HasLength> Clone for CloneableSeekableReader<R> {
    /// Produce a new handle onto the same underlying stream.
    ///
    /// Only the reference count of the shared stream is bumped; the position
    /// and any cached length are copied, so the clone starts where this
    /// instance currently is and moves independently afterwards.
    fn clone(&self) -> Self {
        let Self {
            file,
            pos,
            file_length,
        } = self;
        Self {
            file: Arc::clone(file),
            pos: *pos,
            file_length: *file_length,
        }
    }
}
impl<R: Read + Seek + HasLength> CloneableSeekableReader<R> {
    /// Wrap `file`, taking ownership of it.
    ///
    /// The stream's total length is expected to stay fixed for the lifetime
    /// of the reader: the length is cached after it is first determined, so
    /// later seeks will not notice if the stream grows or shrinks.
    pub(crate) fn new(file: R) -> Self {
        let shared = Arc::new(Mutex::new(file));
        Self {
            file: shared,
            pos: 0u64,
            file_length: None,
        }
    }

    /// Return the length of the underlying stream, querying it only the
    /// first time and serving the cached value afterwards.
    fn ascertain_file_length(&mut self) -> u64 {
        if let Some(known) = self.file_length {
            return known;
        }
        let measured = self.file.lock().unwrap().len();
        self.file_length = Some(measured);
        measured
    }
}
impl<R: Read + Seek + HasLength> Read for CloneableSeekableReader<R> {
    /// Read from the shared stream at this instance's own position.
    ///
    /// The shared stream is locked and repositioned to `self.pos` before
    /// reading, because other clones may have moved it in the meantime. On
    /// success the private position advances by the number of bytes read;
    /// on failure it is left unchanged.
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        let mut shared = self.file.lock().expect("Unable to get underlying file");
        // TODO share an object which knows current position to avoid unnecessary
        // seeks
        shared.seek(SeekFrom::Start(self.pos))?;
        let count = shared.read(buf)?;
        self.pos += count as u64;
        Ok(count)
    }
}
impl<R: Read + Seek + HasLength> Seek for CloneableSeekableReader<R> {
    /// Move this instance's private position.
    ///
    /// No seek is performed on the shared underlying stream here; only
    /// `self.pos` changes. For `SeekFrom::End` the stream length is obtained
    /// (and cached) via `ascertain_file_length`.
    ///
    /// Seeking past the end of the stream is permitted, matching the
    /// [`Seek`] contract.
    ///
    /// # Errors
    ///
    /// Returns an error of kind `InvalidInput`, leaving the position
    /// unchanged, if the requested position would be negative or would not
    /// fit in a `u64`.
    fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
        // Reduce the relative variants to "base plus signed offset".
        let (base, offset) = match pos {
            SeekFrom::Start(absolute) => {
                self.pos = absolute;
                return Ok(absolute);
            }
            SeekFrom::End(offset) => (self.ascertain_file_length(), offset),
            SeekFrom::Current(offset) => (self.pos, offset),
        };

        // Checked arithmetic: unlike negating and casting the offset, this
        // handles positive offsets from the end, `i64::MIN`, and moves before
        // the start without wrapping or panicking.
        let new_pos = base.checked_add_signed(offset).ok_or_else(|| {
            let reason = if offset < 0 {
                "Seek too far backwards"
            } else {
                "Seek position overflows u64"
            };
            std::io::Error::new(std::io::ErrorKind::InvalidInput, reason)
        })?;

        self.pos = new_pos;
        Ok(new_pos)
    }
}
/// A buffered reader reports the length of the reader it wraps.
impl<R: HasLength> HasLength for BufReader<R> {
fn len(&self) -> u64 {
self.get_ref().len()
}
}
/// A file's length is taken from its metadata.
///
/// Panics if the metadata cannot be obtained, since the `unwrap` cannot be
/// turned into an error under the trait's `u64` return type.
impl HasLength for std::fs::File {
fn len(&self) -> u64 {
self.metadata().unwrap().len()
}
}
/// Same as the `std::fs::File` implementation: length from metadata, panicking
/// if the metadata cannot be obtained.
impl HasLength for fs_err::File {
fn len(&self) -> u64 {
self.metadata().unwrap().len()
}
}
/// An owned in-memory buffer: the length is the length of the vector.
impl HasLength for Cursor<Vec<u8>> {
fn len(&self) -> u64 {
self.get_ref().len() as u64
}
}
/// A borrowed in-memory buffer: the length is the length of the vector.
impl HasLength for Cursor<&Vec<u8>> {
fn len(&self) -> u64 {
self.get_ref().len() as u64
}
}
#[cfg(test)]
mod test {
use std::io::{Cursor, Read, Seek, SeekFrom};
use super::CloneableSeekableReader;
/// Walks a single reader over a ten-byte buffer, checking that reads advance
/// the position and that seeks from the start and from the end land where
/// expected. The steps are order-dependent: each assertion relies on the
/// position left behind by the previous one.
#[test]
fn test_cloneable_seekable_reader() {
let buf: Vec<u8> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9];
let buf = Cursor::new(buf);
let mut reader = CloneableSeekableReader::new(buf);
let mut out = vec![0; 2];
// A fresh reader starts at offset 0.
assert!(reader.read_exact(&mut out).is_ok());
assert_eq!(out[0], 0);
assert_eq!(out[1], 1);
// Rewinding to the start yields the same two bytes again.
assert!(reader.seek(SeekFrom::Start(0)).is_ok());
assert!(reader.read_exact(&mut out).is_ok());
assert_eq!(out[0], 0);
assert_eq!(out[1], 1);
// Querying the position must not move it: the next read continues at offset 2.
assert!(reader.stream_position().is_ok());
assert!(reader.read_exact(&mut out).is_ok());
assert_eq!(out[0], 2);
assert_eq!(out[1], 3);
// Seeking relative to the end reaches the last two bytes.
assert!(reader.seek(SeekFrom::End(-2)).is_ok());
assert!(reader.read_exact(&mut out).is_ok());
assert_eq!(out[0], 8);
assert_eq!(out[1], 9);
// The reader is now at the end, so an exact read of two bytes fails.
assert!(reader.read_exact(&mut out).is_err());
}
}

View file

@ -1,3 +0,0 @@
pub(crate) use cloneable_seekable_reader::{CloneableSeekableReader, HasLength};
mod cloneable_seekable_reader;