Store unzipped wheels in a cache (#49)

This PR massively speeds up the case in which you need to install wheels
that already exist in the global cache.

The new strategy is as follows:

- Download the wheel into the content-addressed cache.
- Unzip the wheel into the cache, but ignore content-addressing. It
turns out that writing to `cacache` for every file in the zip added a
ton of overhead, and I don't see any actual advantages to doing so.
Instead, we just unzip the contents into a directory at, e.g.,
`~/.cache/puffin/django-4.1.5`.
- (The unzip itself is now parallelized with Rayon.)
- When installing the wheel, we now support unzipping from a directory
instead of a zip archive. This required duplicating and tweaking a few
functions.
- When installing the wheel, we now use reflinks (or copy-on-write
links). These have a few fantastic properties: (1) they're extremely
cheap to create (on macOS, they are allegedly faster than hard links);
(2) they minimize disk space, since we avoid copying files entirely in
the vast majority of cases; and (3) if the user then edits a file
locally, the cache doesn't get polluted. Orogene, Bun, and soon pnpm all
use reflinks.

Puffin is now ~15x faster than `pip` for the common case of installing
cached data into a fresh environment.

Closes https://github.com/astral-sh/puffin/issues/21.

Closes https://github.com/astral-sh/puffin/issues/39.
This commit is contained in:
Charlie Marsh 2023-10-08 00:04:48 -04:00 committed by GitHub
parent a46887d34b
commit 2a846e76b7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 723 additions and 175 deletions

2
.gitignore vendored
View file

@ -1,3 +1,5 @@
.venv
# Generated by Cargo
# will have compiled files and executables
debug/

7
Cargo.lock generated
View file

@ -1128,6 +1128,7 @@ dependencies = [
"plist",
"pyo3",
"rayon",
"reflink-copy",
"regex",
"rfc2047-decoder",
"serde",
@ -1719,12 +1720,14 @@ dependencies = [
"install-wheel-rs",
"puffin-client",
"puffin-interpreter",
"rayon",
"tempfile",
"tokio",
"tokio-util",
"tracing",
"url",
"wheel-filename",
"zip",
]
[[package]]
@ -1954,9 +1957,9 @@ dependencies = [
[[package]]
name = "reflink-copy"
version = "0.1.8"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f9e3947399fd46f412918bafde71ec68f9b3505f11ef082eeb80bc7fdf4d7caf"
checksum = "d7e3e017e993f86feeddf8a7fb609ca49f89082309e328e27aefd4a25bb317a4"
dependencies = [
"cfg-if",
"ioctl-sys",

View file

@ -32,6 +32,8 @@ memchr = { version = "2.6.4" }
once_cell = { version = "1.18.0" }
platform-info = { version = "2.0.2" }
plist = { version = "1.5.0" }
rayon = { version = "1.8.0" }
reflink-copy = { version = "0.1.9" }
regex = { version = "1.9.6" }
reqwest = { version = "0.11.22", features = ["json", "gzip", "brotli", "stream"] }
reqwest-middleware = { version = "0.2.3" }

View file

@ -70,15 +70,7 @@ Options:
### Resolution
To compare a warm run of `puffin` to `pip-compile`:
```shell
hyperfine --runs 10 --warmup 3 --prepare "rm -f /tmp/tmp.txt" \
"./target/release/puffin-cli compile requirements.txt" \
"pip-compile requirements.txt -o /tmp/tmp.txt"
```
To compare a cold run of `puffin` to `pip-compile`:
To compare with a cold cache:
```shell
hyperfine --runs 10 --warmup 3 --prepare "rm -f /tmp/tmp.txt" \
@ -86,28 +78,36 @@ hyperfine --runs 10 --warmup 3 --prepare "rm -f /tmp/tmp.txt" \
"pip-compile requirements.txt --rebuild --pip-args '--no-cache-dir' -o /tmp/tmp.txt"
```
### Installation
To compare a warm run of `puffin` to `pip`:
To compare with a warm cache:
```shell
hyperfine --runs 10 --warmup 3 \
"./target/release/puffin-cli sync requirements.txt --ignore-installed" \
"pip install -r requirements.txt --ignore-installed --no-deps"
hyperfine --runs 10 --warmup 3 --prepare "rm -f /tmp/tmp.txt" \
"./target/release/puffin-cli compile requirements.txt" \
"pip-compile requirements.txt -o /tmp/tmp.txt"
```
To compare a cold run of `puffin` to `pip`:
### Installation
To compare with a cold cache:
```shell
hyperfine --runs 10 --warmup 3 \
hyperfine --runs 10 --warmup 3 --prepare "rm -rf .venv && virtualenv .venv && source .venv/bin/activate" \
"./target/release/puffin-cli sync requirements.txt --ignore-installed --no-cache" \
"pip install -r requirements.txt --ignore-installed --no-cache-dir --no-deps"
```
To compare a run in which all requirements are already installed:
To compare with a warm cache:
```shell
hyperfine --runs 10 --warmup 3 \
hyperfine --runs 10 --warmup 3 --prepare "rm -rf .venv && virtualenv .venv && source .venv/bin/activate" \
"./target/release/puffin-cli sync requirements.txt --ignore-installed" \
"pip install -r requirements.txt --ignore-installed --no-deps"
```
To compare with all dependencies already installed:
```shell
hyperfine --runs 10 --warmup 3 --prepare "rm -rf .venv && virtualenv .venv && source .venv/bin/activate" \
"./target/release/puffin-cli sync requirements.txt" \
"pip install -r requirements.txt --no-deps"
```

View file

@ -28,6 +28,7 @@ platform-info = { workspace = true }
plist = { workspace = true }
pyo3 = { version = "0.19.2", features = ["extension-module", "abi3-py37"], optional = true }
rayon = { version = "1.8.0", optional = true }
reflink-copy = { workspace = true }
regex = { workspace = true }
rfc2047-decoder = { workspace = true }
serde = { workspace = true, features = ["derive"] }

View file

@ -1,21 +1,28 @@
//! Takes a wheel and installs it, either in a venv or for monotrail.
use std::io;
use std::io::{Read, Seek};
use platform_info::PlatformInfoError;
use thiserror::Error;
use zip::result::ZipError;
use zip::ZipArchive;
pub use install_location::{normalize_name, InstallLocation, LockedDir};
use platform_host::{Arch, Os};
pub use record::RecordEntry;
pub use script::Script;
pub use wheel::{
get_script_launcher, install_wheel, parse_key_value_file, read_record_file, relative_to,
Script, SHEBANG_PYTHON,
SHEBANG_PYTHON,
};
mod install_location;
#[cfg(feature = "python_bindings")]
mod python_bindings;
mod record;
mod script;
pub mod unpacked;
mod wheel;
#[derive(Error, Debug)]
@ -65,3 +72,13 @@ impl Error {
}
}
}
pub fn do_thing(reader: impl Read + Seek) -> Result<(), Error> {
let x = tempfile::tempdir()?;
let mut archive =
ZipArchive::new(reader).map_err(|err| Error::from_zip_error("(index)".to_string(), err))?;
archive.extract(x.path()).unwrap();
Ok(())
}

View file

@ -0,0 +1,16 @@
use serde::{Deserialize, Serialize};
/// Line in a RECORD file
/// <https://www.python.org/dev/peps/pep-0376/#record>
///
/// ```csv
/// tqdm/cli.py,sha256=x_c8nmc4Huc-lKEsAXj78ZiyqSJ9hJ71j7vltY67icw,10509
/// tqdm-4.62.3.dist-info/RECORD,,
/// ```
// Public type: derive `Debug` (for diagnostics) and `Clone` in addition to the
// ordering/serde derives; all fields are `String`/`Option`, so both are free.
#[derive(Debug, Clone, Deserialize, Serialize, PartialOrd, PartialEq, Ord, Eq)]
pub struct RecordEntry {
    // Path of the installed file, as recorded in RECORD (e.g. `tqdm/cli.py`).
    pub path: String,
    // Hash of the file contents (e.g. `sha256=...`); empty in RECORD for the
    // RECORD file itself, as in the example above.
    pub hash: Option<String>,
    // Size of the file in bytes, if recorded.
    #[allow(dead_code)]
    pub size: Option<u64>,
}

View file

@ -0,0 +1,78 @@
use std::collections::{HashMap, HashSet};
use regex::Regex;
use serde::Serialize;
use crate::Error;
/// Minimal `direct_url.json` schema
///
/// <https://packaging.python.org/en/latest/specifications/direct-url/>
/// <https://www.python.org/dev/peps/pep-0610/>
#[derive(Serialize)]
struct DirectUrl {
    // Zero-sized keys/values: presumably always constructed empty so it serializes
    // as an empty `archive_info` object per the direct-url spec — TODO confirm at
    // the construction site.
    #[allow(clippy::zero_sized_map_values)]
    archive_info: HashMap<(), ()>,
    // The URL the distribution was installed from.
    url: String,
}
/// A script defining the name of the runnable entrypoint and the module and function that should be
/// run.
///
/// Variant exposed to Python when the `python_bindings` feature is enabled; fields are
/// readable from Python via the `#[pyo3(get)]` getters.
#[cfg(feature = "python_bindings")]
#[derive(Clone, Debug, Eq, PartialEq, Serialize)]
#[pyo3::pyclass(dict)]
pub struct Script {
    // Name of the entrypoint, e.g. the executable to create for a console script.
    #[pyo3(get)]
    pub script_name: String,
    // Dotted module path containing the entrypoint function (the part before `:`).
    #[pyo3(get)]
    pub module: String,
    // Function within `module` to invoke (the part after `:`).
    #[pyo3(get)]
    pub function: String,
}
/// A script defining the name of the runnable entrypoint and the module and function that should be
/// run.
///
/// Plain-Rust variant used when the `python_bindings` feature is disabled; mirrors the
/// pyo3-exposed definition field-for-field.
#[cfg(not(feature = "python_bindings"))]
#[derive(Clone, Debug, Eq, PartialEq, Serialize)]
pub struct Script {
    // Name of the entrypoint, e.g. the executable to create for a console script.
    pub script_name: String,
    // Dotted module path containing the entrypoint function (the part before `:`).
    pub module: String,
    // Function within `module` to invoke (the part after `:`).
    pub function: String,
}
impl Script {
/// Parses a script definition like `foo.bar:main` or `foomod:main_bar [bar,baz]`
///
/// <https://packaging.python.org/en/latest/specifications/entry-points/>
///
/// Extras are supposed to be ignored, which happens if you pass None for extras
pub fn from_value(
script_name: &str,
value: &str,
extras: Option<&[String]>,
) -> Result<Option<Script>, Error> {
let script_regex = Regex::new(r"^(?P<module>[\w\d_\-.]+):(?P<function>[\w\d_\-.]+)(?:\s+\[(?P<extras>(?:[^,]+,?\s*)+)\])?$").unwrap();
let captures = script_regex
.captures(value)
.ok_or_else(|| Error::InvalidWheel(format!("invalid console script: '{value}'")))?;
if let Some(script_extras) = captures.name("extras") {
let script_extras = script_extras
.as_str()
.split(',')
.map(|extra| extra.trim().to_string())
.collect::<HashSet<String>>();
if let Some(extras) = extras {
if !script_extras.is_subset(&extras.iter().cloned().collect()) {
return Ok(None);
}
}
}
Ok(Some(Script {
script_name: script_name.to_string(),
module: captures.name("module").unwrap().as_str().to_string(),
function: captures.name("function").unwrap().as_str().to_string(),
}))
}
}

View file

@ -0,0 +1,285 @@
//! Like `wheel.rs`, but for installing wheels that have already been unzipped, rather than
//! reading from a zip file.
use std::io::Read;
use std::path::Path;
use configparser::ini::Ini;
use fs_err as fs;
use fs_err::File;
use mailparse::MailHeaderMap;
use tracing::{debug, span, Level};
use walkdir::WalkDir;
use wheel_filename::WheelFilename;
use crate::install_location::{InstallLocation, LockedDir};
use crate::wheel::{
extra_dist_info, install_data, parse_wheel_version, read_scripts_from_section,
write_script_entrypoints,
};
use crate::{read_record_file, Error, Script};
/// Install the given unzipped wheel to the given venv
///
/// The caller must ensure that the wheel is compatible to the environment.
///
/// <https://packaging.python.org/en/latest/specifications/binary-distribution-format/#installing-a-wheel-distribution-1-0-py32-none-any-whl>
///
/// Wheel 1.0: <https://www.python.org/dev/peps/pep-0427/>
pub fn install_wheel(
    location: &InstallLocation<LockedDir>,
    wheel: &Path,
    filename: &WheelFilename,
) -> Result<String, Error> {
    let name = &filename.distribution;
    let _my_span = span!(Level::DEBUG, "install_wheel", name = name.as_str());

    // Only venv installs are supported for unzipped wheels.
    let InstallLocation::Venv {
        venv_base: base_location,
        ..
    } = location
    else {
        return Err(Error::InvalidWheel(
            "Monotrail installation is not supported yet".to_string(),
        ));
    };

    // TODO(charlie): Pass this in.
    let site_packages_python = format!(
        "python{}.{}",
        location.get_python_version().0,
        location.get_python_version().1
    );
    let site_packages = if cfg!(target_os = "windows") {
        base_location.join("Lib").join("site-packages")
    } else {
        base_location
            .join("lib")
            .join(site_packages_python)
            .join("site-packages")
    };

    debug!(name = name.as_str(), "Getting wheel metadata");
    let dist_info_prefix = find_dist_info(wheel)?;
    let (name, _version) = read_metadata(&dist_info_prefix, wheel)?;
    // TODO: Check that name and version match

    // We're going step by step through
    // https://packaging.python.org/en/latest/specifications/binary-distribution-format/#installing-a-wheel-distribution-1-0-py32-none-any-whl
    // > 1.a Parse distribution-1.0.dist-info/WHEEL.
    // > 1.b Check that installer is compatible with Wheel-Version. Warn if minor version is greater, abort if major version is greater.
    let wheel_file_path = wheel.join(format!("{dist_info_prefix}.dist-info/WHEEL"));
    // Use `fs_err` (like the rest of this module) so I/O errors include the path.
    let wheel_text = fs::read_to_string(&wheel_file_path)?;
    parse_wheel_version(&wheel_text)?;

    // > 1.c If Root-Is-Purelib == true, unpack archive into purelib (site-packages).
    // > 1.d Else unpack archive into platlib (site-packages).
    // We always install in the same virtualenv site packages
    debug!(name = name.as_str(), "Extracting file");
    let num_unpacked = unpack_wheel_files(&site_packages, wheel)?;
    debug!(name = name.as_str(), "Extracted {num_unpacked} files");

    // Read the RECORD file.
    let mut record_file = File::open(wheel.join(format!("{dist_info_prefix}.dist-info/RECORD")))?;
    let mut record = read_record_file(&mut record_file)?;

    debug!(name = name.as_str(), "Writing entrypoints");
    let (console_scripts, gui_scripts) = parse_scripts(wheel, &dist_info_prefix, None)?;
    write_script_entrypoints(&site_packages, location, &console_scripts, &mut record)?;
    write_script_entrypoints(&site_packages, location, &gui_scripts, &mut record)?;

    let data_dir = site_packages.join(format!("{dist_info_prefix}.data"));
    // 2.a Unpacked archive includes distribution-1.0.dist-info/ and (if there is data) distribution-1.0.data/.
    // 2.b Move each subtree of distribution-1.0.data/ onto its destination path. Each subdirectory of distribution-1.0.data/ is a key into a dict of destination directories, such as distribution-1.0.data/(purelib|platlib|headers|scripts|data). The initially supported paths are taken from distutils.command.install.
    if data_dir.is_dir() {
        debug!(name = name.as_str(), "Installing data");
        install_data(
            base_location,
            &site_packages,
            &data_dir,
            &name,
            location,
            &console_scripts,
            &gui_scripts,
            &mut record,
        )?;
        // 2.c If applicable, update scripts starting with #!python to point to the correct interpreter.
        // Scripts are unsupported through data
        // 2.e Remove empty distribution-1.0.data directory.
        fs::remove_dir_all(data_dir)?;
    } else {
        debug!(name = name.as_str(), "No data");
    }

    debug!(name = name.as_str(), "Writing extra metadata");
    extra_dist_info(&site_packages, &dist_info_prefix, true, &mut record)?;

    debug!(name = name.as_str(), "Writing record");
    let mut record_writer = csv::WriterBuilder::new()
        .has_headers(false)
        .escape(b'"')
        .from_path(site_packages.join(format!("{dist_info_prefix}.dist-info/RECORD")))?;
    record.sort();
    for entry in record {
        record_writer.serialize(entry)?;
    }

    Ok(filename.get_tag())
}
/// The metadata name may be uppercase, while the wheel and dist info names are lowercase, or
/// the metadata name and the dist info name are lowercase, while the wheel name is uppercase.
/// Either way, we just search the wheel for the name
///
/// <https://github.com/PyO3/python-pkginfo-rs>
fn find_dist_info(path: &Path) -> Result<String, Error> {
// Iterate over `path` to find the `.dist-info` directory. It should be at the top-level.
let Some(dist_info) = std::fs::read_dir(path)?.find_map(|entry| {
let entry = entry.ok()?;
let path = entry.path();
if path.is_dir() {
if path.extension().map_or(false, |ext| ext == "dist-info") {
Some(path)
} else {
None
}
} else {
None
}
}) else {
return Err(Error::InvalidWheel(
"Missing .dist-info directory".to_string(),
));
};
let Some(dist_info_prefix) = dist_info.file_stem() else {
return Err(Error::InvalidWheel(
"Missing .dist-info directory".to_string(),
));
};
Ok(dist_info_prefix.to_string_lossy().to_string())
}
/// <https://github.com/PyO3/python-pkginfo-rs>
fn read_metadata(dist_info_prefix: &str, wheel: &Path) -> Result<(String, String), Error> {
let metadata_file = wheel.join(format!("{dist_info_prefix}.dist-info/METADATA"));
// Read into a buffer.
let mut content = Vec::new();
File::open(&metadata_file)?.read_to_end(&mut content)?;
// HACK: trick mailparse to parse as UTF-8 instead of ASCII
let mut mail = b"Content-Type: text/plain; charset=utf-8\n".to_vec();
mail.extend_from_slice(&content);
let msg = mailparse::parse_mail(&mail).map_err(|err| {
Error::InvalidWheel(format!("Invalid {}: {}", metadata_file.display(), err))
})?;
let headers = msg.get_headers();
let metadata_version =
headers
.get_first_value("Metadata-Version")
.ok_or(Error::InvalidWheel(format!(
"No Metadata-Version field in {}",
metadata_file.display()
)))?;
// Crude but it should do https://packaging.python.org/en/latest/specifications/core-metadata/#metadata-version
// At time of writing:
// > Version of the file format; legal values are “1.0”, “1.1”, “1.2”, “2.1”, “2.2”, and “2.3”.
if !(metadata_version.starts_with("1.") || metadata_version.starts_with("2.")) {
return Err(Error::InvalidWheel(format!(
"Metadata-Version field has unsupported value {metadata_version}"
)));
}
let name = headers
.get_first_value("Name")
.ok_or(Error::InvalidWheel(format!(
"No Name field in {}",
metadata_file.display()
)))?;
let version = headers
.get_first_value("Version")
.ok_or(Error::InvalidWheel(format!(
"No Version field in {}",
metadata_file.display()
)))?;
Ok((name, version))
}
/// Parses the `entry_points.txt` entry in the wheel for console scripts
///
/// Returns (`script_name`, module, function)
///
/// Extras are supposed to be ignored, which happens if you pass None for extras
fn parse_scripts(
    wheel: &Path,
    dist_info_prefix: &str,
    extras: Option<&[String]>,
) -> Result<(Vec<Script>, Vec<Script>), Error> {
    let entry_points_path = wheel.join(format!("{dist_info_prefix}.dist-info/entry_points.txt"));

    // Read the entry points mapping. If the file doesn't exist, we just return an empty mapping.
    let Ok(ini) = std::fs::read_to_string(entry_points_path) else {
        return Ok((Vec::new(), Vec::new()));
    };
    let entry_points_mapping = Ini::new_cs()
        .read(ini)
        .map_err(|err| Error::InvalidWheel(format!("entry_points.txt is invalid: {err}")))?;

    // TODO: handle extras
    // For each section, parse its scripts if present; a missing section means no scripts.
    let console_scripts = entry_points_mapping
        .get("console_scripts")
        .map(|section| read_scripts_from_section(section, "console_scripts", extras))
        .transpose()?
        .unwrap_or_default();
    let gui_scripts = entry_points_mapping
        .get("gui_scripts")
        .map(|section| read_scripts_from_section(section, "gui_scripts", extras))
        .transpose()?
        .unwrap_or_default();

    Ok((console_scripts, gui_scripts))
}
/// Extract all files from the unzipped wheel directory into the site packages.
///
/// Each regular file is copied via `reflink_or_copy` (copy-on-write reflink where the
/// filesystem supports it, falling back to a plain copy), and Unix permission bits are
/// carried over from the source.
///
/// Returns the number of files (not directories) copied.
fn unpack_wheel_files(site_packages: &Path, wheel: &Path) -> Result<usize, Error> {
    let mut count = 0usize;
    // Walk over the directory. WalkDir yields directories before their contents, so
    // parent directories are created before the files inside them are copied.
    for entry in WalkDir::new(wheel) {
        let entry = entry?;
        // Re-root the entry under `site_packages`.
        let relative = entry.path().strip_prefix(wheel).unwrap();
        let out_path = site_packages.join(relative);
        if entry.file_type().is_dir() {
            fs::create_dir_all(&out_path)?;
            continue;
        }
        reflink_copy::reflink_or_copy(entry.path(), &out_path)?;
        #[cfg(unix)]
        {
            use std::fs::Permissions;
            use std::os::unix::fs::PermissionsExt;
            // Preserve the source file's mode (e.g. executable bits); best-effort if
            // metadata is unavailable.
            if let Ok(metadata) = entry.metadata() {
                fs::set_permissions(
                    &out_path,
                    Permissions::from_mode(metadata.permissions().mode()),
                )?;
            }
        }
        count += 1;
    }
    Ok(count)
}

View file

@ -1,5 +1,3 @@
#![allow(clippy::needless_borrow)]
use std::collections::{HashMap, HashSet};
use std::ffi::OsString;
use std::io::{BufRead, BufReader, BufWriter, Cursor, Read, Seek, Write};
@ -12,8 +10,6 @@ use data_encoding::BASE64URL_NOPAD;
use fs_err as fs;
use fs_err::{DirEntry, File};
use mailparse::MailHeaderMap;
use regex::Regex;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use tempfile::{tempdir, TempDir};
use tracing::{debug, error, span, warn, Level};
@ -25,6 +21,8 @@ use zip::{ZipArchive, ZipWriter};
use wheel_filename::WheelFilename;
use crate::install_location::{InstallLocation, LockedDir};
use crate::record::RecordEntry;
use crate::script::Script;
use crate::{normalize_name, Error};
/// `#!/usr/bin/env python`
@ -34,93 +32,6 @@ pub(crate) const LAUNCHER_T32: &[u8] = include_bytes!("../windows-launcher/t32.e
pub(crate) const LAUNCHER_T64: &[u8] = include_bytes!("../windows-launcher/t64.exe");
pub(crate) const LAUNCHER_T64_ARM: &[u8] = include_bytes!("../windows-launcher/t64-arm.exe");
/// Line in a RECORD file
/// <https://www.python.org/dev/peps/pep-0376/#record>
///
/// ```csv
/// tqdm/cli.py,sha256=x_c8nmc4Huc-lKEsAXj78ZiyqSJ9hJ71j7vltY67icw,10509
/// tqdm-4.62.3.dist-info/RECORD,,
/// ```
#[derive(Deserialize, Serialize, PartialOrd, PartialEq, Ord, Eq)]
pub struct RecordEntry {
pub path: String,
pub hash: Option<String>,
#[allow(dead_code)]
pub size: Option<u64>,
}
/// Minimal `direct_url.json` schema
///
/// <https://packaging.python.org/en/latest/specifications/direct-url/>
/// <https://www.python.org/dev/peps/pep-0610/>
#[derive(Serialize)]
struct DirectUrl {
#[allow(clippy::zero_sized_map_values)]
archive_info: HashMap<(), ()>,
url: String,
}
/// A script defining the name of the runnable entrypoint and the module and function that should be
/// run.
#[cfg(feature = "python_bindings")]
#[derive(Clone, Debug, Eq, PartialEq, Serialize)]
#[pyo3::pyclass(dict)]
pub struct Script {
#[pyo3(get)]
pub script_name: String,
#[pyo3(get)]
pub module: String,
#[pyo3(get)]
pub function: String,
}
/// A script defining the name of the runnable entrypoint and the module and function that should be
/// run.
#[cfg(not(feature = "python_bindings"))]
#[derive(Clone, Debug, Eq, PartialEq, Serialize)]
pub struct Script {
pub script_name: String,
pub module: String,
pub function: String,
}
impl Script {
/// Parses a script definition like `foo.bar:main` or `foomod:main_bar [bar,baz]`
///
/// <https://packaging.python.org/en/latest/specifications/entry-points/>
///
/// Extras are supposed to be ignored, which happens if you pass None for extras
pub fn from_value(
script_name: &str,
value: &str,
extras: Option<&[String]>,
) -> Result<Option<Script>, Error> {
let script_regex = Regex::new(r"^(?P<module>[\w\d_\-.]+):(?P<function>[\w\d_\-.]+)(?:\s+\[(?P<extras>(?:[^,]+,?\s*)+)\])?$").unwrap();
let captures = script_regex
.captures(value)
.ok_or_else(|| Error::InvalidWheel(format!("invalid console script: '{value}'")))?;
if let Some(script_extras) = captures.name("extras") {
let script_extras = script_extras
.as_str()
.split(',')
.map(|extra| extra.trim().to_string())
.collect::<HashSet<String>>();
if let Some(extras) = extras {
if !script_extras.is_subset(&extras.iter().cloned().collect()) {
return Ok(None);
}
}
}
Ok(Some(Script {
script_name: script_name.to_string(),
module: captures.name("module").unwrap().as_str().to_string(),
function: captures.name("function").unwrap().as_str().to_string(),
}))
}
}
/// Wrapper script template function
///
/// <https://github.com/pypa/pip/blob/7f8a6844037fb7255cfd0d34ff8e8cf44f2598d4/src/pip/_vendor/distlib/scripts.py#L41-L48>
@ -139,7 +50,7 @@ if __name__ == "__main__":
}
/// Part of entrypoints parsing
fn read_scripts_from_section(
pub(crate) fn read_scripts_from_section(
scripts_section: &HashMap<String, Option<String>>,
section_name: &str,
extras: Option<&[String]>,
@ -268,7 +179,7 @@ fn unpack_wheel_files<R: Read + Seek>(
if let Some(p) = out_path.parent() {
if !created_dirs.contains(p) {
fs::create_dir_all(&p)?;
fs::create_dir_all(p)?;
created_dirs.insert(p.to_path_buf());
}
}
@ -350,7 +261,7 @@ fn unpack_wheel_files<R: Read + Seek>(
Ok(extracted_paths)
}
fn get_shebang(location: &InstallLocation<LockedDir>) -> String {
pub(crate) fn get_shebang(location: &InstallLocation<LockedDir>) -> String {
if matches!(location, InstallLocation::Venv { .. }) {
let path = location.get_python().display().to_string();
let path = if cfg!(windows) {
@ -380,7 +291,7 @@ fn get_shebang(location: &InstallLocation<LockedDir>) -> String {
/// TODO: a nice, reproducible-without-distlib rust solution
///
/// <https://github.com/pypa/pip/blob/fd0ea6bc5e8cb95e518c23d901c26ca14db17f89/src/pip/_vendor/distlib/scripts.py#L248-L262>
fn windows_script_launcher(launcher_python_script: &str) -> Result<Vec<u8>, Error> {
pub(crate) fn windows_script_launcher(launcher_python_script: &str) -> Result<Vec<u8>, Error> {
let launcher_bin = match env::consts::ARCH {
"x84" => LAUNCHER_T32,
"x86_64" => LAUNCHER_T64,
@ -419,7 +330,7 @@ fn windows_script_launcher(launcher_python_script: &str) -> Result<Vec<u8>, Erro
/// We also pass `venv_base` so we can write the same path as pip does
///
/// TODO: Test for this launcher directly in install-wheel-rs
fn write_script_entrypoints(
pub(crate) fn write_script_entrypoints(
site_packages: &Path,
location: &InstallLocation<LockedDir>,
entrypoints: &[Script],
@ -444,7 +355,7 @@ fn write_script_entrypoints(
let launcher_python_script = get_script_launcher(
&entrypoint.module,
&entrypoint.function,
&get_shebang(&location),
&get_shebang(location),
);
if cfg!(windows) {
let launcher = windows_script_launcher(&launcher_python_script)?;
@ -470,7 +381,7 @@ fn write_script_entrypoints(
Ok(())
}
fn bin_rel() -> PathBuf {
pub(crate) fn bin_rel() -> PathBuf {
if cfg!(windows) {
// windows doesn't have the python part, only Lib/site-packages
Path::new("..").join("..").join("Scripts")
@ -484,7 +395,7 @@ fn bin_rel() -> PathBuf {
///
/// > {distribution}-{version}.dist-info/WHEEL is metadata about the archive itself in the same
/// > basic key: value format:
fn parse_wheel_version(wheel_text: &str) -> Result<(), Error> {
pub(crate) fn parse_wheel_version(wheel_text: &str) -> Result<(), Error> {
// {distribution}-{version}.dist-info/WHEEL is metadata about the archive itself in the same basic key: value format:
let data = parse_key_value_file(&mut wheel_text.as_bytes(), "WHEEL")?;
@ -548,7 +459,7 @@ fn bytecode_compile(
let mut retries = 3;
let (status, lines) = loop {
let (status, lines) =
bytecode_compile_inner(site_packages, &py_source_paths, &sys_executable)?;
bytecode_compile_inner(site_packages, &py_source_paths, sys_executable)?;
retries -= 1;
if status.success() || retries == 0 {
break (status, lines);
@ -624,7 +535,7 @@ fn bytecode_compile_inner(
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.stderr(Stdio::inherit())
.current_dir(&site_packages)
.current_dir(site_packages)
.spawn()
.map_err(Error::PythonSubcommand)?;
@ -696,21 +607,21 @@ pub fn relative_to(path: &Path, base: &Path) -> Result<PathBuf, Error> {
}
/// Moves the files and folders in src to dest, updating the RECORD in the process
fn move_folder_recorded(
pub(crate) fn move_folder_recorded(
src_dir: &Path,
dest_dir: &Path,
site_packages: &Path,
record: &mut [RecordEntry],
) -> Result<(), Error> {
if !dest_dir.is_dir() {
fs::create_dir_all(&dest_dir)?;
fs::create_dir_all(dest_dir)?;
}
for entry in WalkDir::new(&src_dir) {
for entry in WalkDir::new(src_dir) {
let entry = entry?;
let src = entry.path();
// This is the base path for moving to the actual target for the data
// e.g. for data it's without <..>.data/data/
let relative_to_data = src.strip_prefix(&src_dir).expect("Prefix must no change");
let relative_to_data = src.strip_prefix(src_dir).expect("Prefix must no change");
// This is the path stored in RECORD
// e.g. for data it's with .data/data/
let relative_to_site_packages = src
@ -733,7 +644,7 @@ fn move_folder_recorded(
src.display()
))
})?;
entry.path = relative_to(&target, &site_packages)?.display().to_string();
entry.path = relative_to(&target, site_packages)?.display().to_string();
}
}
Ok(())
@ -775,7 +686,7 @@ fn install_script(
start.resize(placeholder_python.len(), 0);
script.read_exact(&mut start)?;
let size_and_encoded_hash = if start == placeholder_python {
let start = get_shebang(&location).as_bytes().to_vec();
let start = get_shebang(location).as_bytes().to_vec();
let mut target = File::create(site_packages.join(&target_path))?;
let size_and_encoded_hash = copy_and_hash(&mut start.chain(script), &mut target)?;
fs::remove_file(&path)?;
@ -783,7 +694,7 @@ fn install_script(
} else {
// reading and writing is slow especially for large binaries, so we move them instead
drop(script);
fs::rename(&path, &site_packages.join(&target_path))?;
fs::rename(&path, site_packages.join(&target_path))?;
None
};
#[cfg(unix)]
@ -821,7 +732,7 @@ fn install_script(
/// Move the files from the .data directory to the right location in the venv
#[allow(clippy::too_many_arguments)]
fn install_data(
pub(crate) fn install_data(
venv_base: &Path,
site_packages: &Path,
data_dir: &Path,
@ -858,7 +769,7 @@ fn install_data(
continue;
}
install_script(site_packages, record, &file, &location)?;
install_script(site_packages, record, &file, location)?;
}
}
Some("headers") => {
@ -894,7 +805,7 @@ fn install_data(
///
/// We pass both the absolute path to the site packages and the relative path within the
/// site packages, because we must only record the relative path in RECORD
fn write_file_recorded(
pub(crate) fn write_file_recorded(
site_packages: &Path,
relative_path: &Path,
content: impl AsRef<[u8]>,
@ -912,7 +823,7 @@ fn write_file_recorded(
}
/// Adds INSTALLER, REQUESTED and `direct_url.json` to the .dist-info dir
fn extra_dist_info(
pub(crate) fn extra_dist_info(
site_packages: &Path,
dist_info_prefix: &str,
requested: bool,
@ -933,24 +844,6 @@ fn extra_dist_info(
)?;
}
// https://github.com/python-poetry/poetry/issues/6356
/*
let wheel_path_url = format!("file://{}", wheel_path.canonicalize()?.display());
let direct_url = DirectUrl {
archive_info: HashMap::new(),
url: wheel_path_url,
};
// Map explicitly because we special cased that error
let direct_url_json =
serde_json::to_string(&direct_url).map_err(WheelInstallerError::DirectUrlSerdeJsonError)?;
write_file_recorded(
site_packages,
&dist_info.join("direct_url.json"),
&direct_url_json,
record,
)?;
*/
Ok(())
}
@ -1071,7 +964,7 @@ pub fn install_wheel(
ZipArchive::new(reader).map_err(|err| Error::from_zip_error("(index)".to_string(), err))?;
debug!(name = name.as_str(), "Getting wheel metadata");
let dist_info_prefix = find_dist_info(&filename, &mut archive)?;
let dist_info_prefix = find_dist_info(filename, &mut archive)?;
let (name, _version) = read_metadata(&dist_info_prefix, &mut archive)?;
// TODO: Check that name and version match
@ -1112,8 +1005,8 @@ pub fn install_wheel(
debug!(name = name.as_str(), "Writing entrypoints");
let (console_scripts, gui_scripts) = parse_scripts(&mut archive, &dist_info_prefix, None)?;
write_script_entrypoints(&site_packages, &location, &console_scripts, &mut record)?;
write_script_entrypoints(&site_packages, &location, &gui_scripts, &mut record)?;
write_script_entrypoints(&site_packages, location, &console_scripts, &mut record)?;
write_script_entrypoints(&site_packages, location, &gui_scripts, &mut record)?;
let data_dir = site_packages.join(format!("{dist_info_prefix}.data"));
// 2.a Unpacked archive includes distribution-1.0.dist-info/ and (if there is data) distribution-1.0.data/.
@ -1125,7 +1018,7 @@ pub fn install_wheel(
&site_packages,
&data_dir,
&name,
&location,
location,
&console_scripts,
&gui_scripts,
&mut record,

View file

@ -17,8 +17,10 @@ wheel-filename = { path = "../wheel-filename" }
anyhow = { workspace = true }
cacache = { workspace = true }
rayon = { workspace = true }
tempfile = { workspace = true }
tokio = { workspace = true }
tokio-util = { workspace = true }
tracing = { workspace = true }
url = { workspace = true }
zip = { workspace = true }

View file

@ -3,16 +3,23 @@ use std::str::FromStr;
use anyhow::Result;
use cacache::{Algorithm, Integrity};
use rayon::iter::ParallelBridge;
use rayon::iter::ParallelIterator;
use tokio::task::JoinSet;
use tokio_util::compat::FuturesAsyncReadCompatExt;
use tracing::debug;
use url::Url;
use zip::ZipArchive;
use install_wheel_rs::{install_wheel, InstallLocation};
use install_wheel_rs::{unpacked, InstallLocation};
use puffin_client::{File, PypiClient};
use puffin_interpreter::PythonExecutable;
use wheel_filename::WheelFilename;
use crate::vendor::CloneableSeekableReader;
mod vendor;
/// Install a set of wheels into a Python virtual environment.
pub async fn install(
wheels: &[File],
@ -20,10 +27,24 @@ pub async fn install(
client: &PypiClient,
cache: Option<&Path>,
) -> Result<()> {
// Fetch the wheels in parallel.
// Phase 1: Fetch the wheels in parallel.
debug!("Phase 1: Fetching wheels");
let mut fetches = JoinSet::new();
let mut results = Vec::with_capacity(wheels.len());
let mut downloads = Vec::with_capacity(wheels.len());
for wheel in wheels {
let sha256 = wheel.hashes.sha256.clone();
let filename = wheel.filename.clone();
// If the unzipped wheel exists in the cache, skip it.
if let Some(cache) = cache {
if cache.join(&sha256).exists() {
debug!("Found wheel in cache: {:?}", filename);
continue;
}
}
debug!("Fetching wheel: {:?}", filename);
fetches.spawn(fetch_wheel(
wheel.clone(),
client.clone(),
@ -31,36 +52,55 @@ pub async fn install(
));
}
while let Some(result) = fetches.join_next().await.transpose()? {
results.push(result?);
downloads.push(result?);
}
// Install each wheel.
let temp_dir = tempfile::tempdir()?;
// Phase 2: Unpack the wheels into the cache.
debug!("Phase 2: Unpacking wheels");
for wheel in downloads {
let sha256 = wheel.file.hashes.sha256.clone();
let filename = wheel.file.filename.clone();
debug!("Unpacking wheel: {:?}", filename);
// Unzip the wheel.
tokio::task::spawn_blocking({
let target = temp_dir.path().join(&sha256);
move || unzip_wheel(wheel, &target)
})
.await??;
// Write the unzipped wheel to the cache (atomically).
if let Some(cache) = cache {
debug!("Caching wheel: {:?}", filename);
tokio::fs::rename(temp_dir.path().join(&sha256), cache.join(&sha256)).await?;
}
}
// Phase 3: Install each wheel.
debug!("Phase 3: Installing wheels");
let location = InstallLocation::Venv {
venv_base: python.venv().to_path_buf(),
python_version: python.simple_version(),
};
let locked_dir = location.acquire_lock()?;
for wheel in results {
let reader = std::io::Cursor::new(wheel.buffer);
let filename = WheelFilename::from_str(&wheel.file.filename)?;
for wheel in wheels {
let dir = cache
.unwrap_or_else(|| temp_dir.path())
.join(&wheel.hashes.sha256);
let filename = WheelFilename::from_str(&wheel.filename)?;
// TODO(charlie): Should this be async?
install_wheel(
&locked_dir,
reader,
&filename,
false,
false,
&[],
"",
python.executable(),
)?;
unpacked::install_wheel(&locked_dir, &dir, &filename)?;
}
Ok(())
}
#[derive(Debug)]
#[derive(Debug, Clone)]
struct FetchedWheel {
file: File,
buffer: Vec<u8>,
@ -98,3 +138,48 @@ async fn fetch_wheel(
Ok(FetchedWheel { file, buffer })
}
/// Write a wheel into the target directory.
///
/// Unzips the wheel archive (held in memory in `wheel.buffer`) into `target`,
/// extracting entries in parallel via Rayon. On Unix, per-file permissions
/// recorded in the archive are preserved.
fn unzip_wheel(wheel: FetchedWheel, target: &Path) -> Result<()> {
    // Read the wheel into a buffer.
    let reader = std::io::Cursor::new(wheel.buffer);
    let archive = ZipArchive::new(CloneableSeekableReader::new(reader))?;

    // Unzip in parallel. Each task clones the (cheaply cloneable) archive
    // handle and extracts a single entry by index.
    (0..archive.len())
        .par_bridge()
        .map(|file_number| {
            let mut archive = archive.clone();
            let mut file = archive.by_index(file_number)?;

            // Determine the path of the file within the wheel; skip entries
            // with unsafe names (absolute paths or `..` components).
            let file_path = match file.enclosed_name() {
                Some(path) => path.to_owned(),
                None => return Ok(()),
            };
            let path = target.join(file_path);

            // Directory entries carry no contents: materialize the directory
            // and move on. (Calling `File::create` on a directory path would
            // fail.)
            if file.is_dir() {
                std::fs::create_dir_all(&path)?;
                return Ok(());
            }

            // Create necessary parent directories.
            if let Some(parent) = path.parent() {
                std::fs::create_dir_all(parent)?;
            }

            // Write the file.
            let mut outfile = std::fs::File::create(&path)?;
            std::io::copy(&mut file, &mut outfile)?;

            // Set permissions.
            #[cfg(unix)]
            {
                use std::fs::Permissions;
                use std::os::unix::fs::PermissionsExt;

                if let Some(mode) = file.unix_mode() {
                    std::fs::set_permissions(&path, Permissions::from_mode(mode))?;
                }
            }

            Ok(())
        })
        .collect::<Result<_>>()
}

View file

@ -0,0 +1,161 @@
// Copyright 2022 Google LLC
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
#![allow(clippy::cast_sign_loss)]
use std::{
fs::File,
io::{BufReader, Cursor, Read, Seek, SeekFrom},
sync::{Arc, Mutex},
};
/// A trait to represent some reader which has a total length known in
/// advance. This is roughly equivalent to the nightly
/// [`Seek::stream_len`] API.
pub(crate) trait HasLength {
    /// Return the current total length of this stream, in bytes.
    fn len(&self) -> u64;
}
/// A [`Read`] which refers to its underlying stream by reference count,
/// and thus can be cloned cheaply. It supports seeking; each cloned instance
/// maintains its own pointer into the file, and the underlying instance
/// is seeked prior to each read.
pub(crate) struct CloneableSeekableReader<R: Read + Seek + HasLength> {
    /// The shared underlying stream, locked for the duration of each read.
    file: Arc<Mutex<R>>,
    /// This handle's independent cursor position within the stream.
    pos: u64,
    // TODO determine and store this once instead of per cloneable file
    /// Total stream length, lazily determined and cached per handle.
    file_length: Option<u64>,
}
impl<R: Read + Seek + HasLength> Clone for CloneableSeekableReader<R> {
fn clone(&self) -> Self {
Self {
file: self.file.clone(),
pos: self.pos,
file_length: self.file_length,
}
}
}
impl<R: Read + Seek + HasLength> CloneableSeekableReader<R> {
    /// Constructor. Takes ownership of the underlying `Read`.
    /// You should pass in only streams whose total length you expect
    /// to be fixed and unchanging. Odd behavior may occur if the length
    /// of the stream changes; any subsequent seeks will not take account
    /// of the changed stream length.
    pub(crate) fn new(file: R) -> Self {
        Self {
            file: Arc::new(Mutex::new(file)),
            pos: 0,
            file_length: None,
        }
    }

    /// Determine the length of the underlying stream, caching it in this
    /// handle the first time it is queried.
    fn ascertain_file_length(&mut self) -> u64 {
        match self.file_length {
            Some(len) => len,
            None => {
                let len = self.file.lock().unwrap().len();
                self.file_length = Some(len);
                len
            }
        }
    }
}
impl<R: Read + Seek + HasLength> Read for CloneableSeekableReader<R> {
    /// Read from the shared stream at this handle's cursor position, then
    /// advance the cursor by the number of bytes actually read.
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        let mut guard = self.file.lock().expect("Unable to get underlying file");
        // TODO share an object which knows current position to avoid unnecessary
        // seeks
        guard.seek(SeekFrom::Start(self.pos))?;
        let bytes_read = guard.read(buf)?;
        self.pos += bytes_read as u64;
        Ok(bytes_read)
    }
}
impl<R: Read + Seek + HasLength> Seek for CloneableSeekableReader<R> {
    /// Seek this handle's cursor.
    ///
    /// `SeekFrom::End` is resolved against the (cached) total stream length.
    /// Per the [`Seek`] contract, seeking beyond the end of the stream is
    /// permitted; seeking before the start is an error. Uses
    /// `u64::checked_add_signed` (stable since Rust 1.66), resolving the
    /// previous TODOs: the old hand-rolled negation wrongly rejected
    /// `SeekFrom::End` with a positive offset and could underflow on
    /// `SeekFrom::Current` with a large negative offset.
    fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
        // Offset `base` by a signed delta, erroring instead of wrapping when
        // the result would be negative (or overflow u64).
        fn offset_from(base: u64, offset: i64) -> std::io::Result<u64> {
            base.checked_add_signed(offset).ok_or_else(|| {
                std::io::Error::new(
                    std::io::ErrorKind::InvalidInput,
                    "Seek too far backwards",
                )
            })
        }

        let new_pos = match pos {
            SeekFrom::Start(pos) => pos,
            SeekFrom::End(offset_from_end) => {
                let file_len = self.ascertain_file_length();
                offset_from(file_len, offset_from_end)?
            }
            SeekFrom::Current(offset_from_pos) => offset_from(self.pos, offset_from_pos)?,
        };
        self.pos = new_pos;
        Ok(new_pos)
    }
}
/// A `BufReader` is exactly as long as the stream it wraps; buffering does
/// not change the total length.
impl<R: HasLength> HasLength for BufReader<R> {
    fn len(&self) -> u64 {
        self.get_ref().len()
    }
}
impl HasLength for File {
    /// The length of the file, per its filesystem metadata.
    ///
    /// # Panics
    ///
    /// Panics if the file's metadata cannot be read (e.g., the file was
    /// removed out from under us); the `HasLength` trait offers no channel
    /// for reporting errors.
    fn len(&self) -> u64 {
        self.metadata()
            .expect("failed to read file metadata")
            .len()
    }
}
impl HasLength for Cursor<Vec<u8>> {
    // An in-memory cursor is as long as its backing buffer. The
    // `usize as u64` cast is lossless on 32- and 64-bit targets.
    fn len(&self) -> u64 {
        self.get_ref().len() as u64
    }
}
#[cfg(test)]
mod test {
    use std::io::{Cursor, Read, Seek, SeekFrom};
    use super::CloneableSeekableReader;

    /// End-to-end check of `CloneableSeekableReader` over an in-memory
    /// buffer: sequential reads, rewinding, position queries, seeking
    /// relative to the end, and the error when reading past the end.
    #[test]
    fn test_cloneable_seekable_reader() {
        let buf: Vec<u8> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9];
        let buf = Cursor::new(buf);
        let mut reader = CloneableSeekableReader::new(buf);
        // Read the first two bytes.
        let mut out = vec![0; 2];
        assert!(reader.read_exact(&mut out).is_ok());
        assert_eq!(out[0], 0);
        assert_eq!(out[1], 1);
        // Rewind to the start; the same bytes should come back.
        assert!(reader.seek(SeekFrom::Start(0)).is_ok());
        assert!(reader.read_exact(&mut out).is_ok());
        assert_eq!(out[0], 0);
        assert_eq!(out[1], 1);
        // Querying the position must not disturb the cursor...
        assert!(reader.stream_position().is_ok());
        // ...so the next read continues where the last one left off.
        assert!(reader.read_exact(&mut out).is_ok());
        assert_eq!(out[0], 2);
        assert_eq!(out[1], 3);
        // Seek to two bytes before the end and read the tail.
        assert!(reader.seek(SeekFrom::End(-2)).is_ok());
        assert!(reader.read_exact(&mut out).is_ok());
        assert_eq!(out[0], 8);
        assert_eq!(out[1], 9);
        // Reading past the end must fail.
        assert!(reader.read_exact(&mut out).is_err());
    }
}

View file

@ -0,0 +1,3 @@
pub(crate) use cloneable_seekable_reader::CloneableSeekableReader;
mod cloneable_seekable_reader;