Store unzipped wheels in a cache (#49)

This PR massively speeds up the case in which you need to install wheels that already exist in the global cache. The new strategy is as follows:

- Download the wheel into the content-addressed cache.
- Unzip the wheel into the cache, but ignore content-addressing. It turns out that writing to `cacache` for every file in the zip added a ton of overhead, and I don't see any actual advantages to doing so. Instead, we just unzip the contents into a directory at, e.g., `~/.cache/puffin/django-4.1.5`.
- (The unzip itself is now parallelized with Rayon.)
- When installing the wheel, we now support installing from an unzipped directory instead of a zip archive. This required duplicating and tweaking a few functions.
- When installing the wheel, we now use reflinks (or copy-on-write links). These have a few fantastic properties: (1) they're extremely cheap to create (on macOS, they are allegedly faster than hard links); (2) they minimize disk space, since we avoid copying files entirely in the vast majority of cases; and (3) if the user then edits a file locally, the cache doesn't get polluted. Orogene, Bun, and soon pnpm all use reflinks.

Puffin is now ~15x faster than `pip` for the common case of installing cached data into a fresh environment.

Closes https://github.com/astral-sh/puffin/issues/21.
Closes https://github.com/astral-sh/puffin/issues/39.
2025-08-15 08:10:15 +00:00 · 2023-10-08 00:04:48 -04:00 · 2023-10-08 00:04:48 -04:00 · 2a846e76b7
commit 2a846e76b7
parent a46887d34b
14 changed files with 723 additions and 175 deletions
--- a/crates/install-wheel-rs/src/script.rs
+++ b/crates/install-wheel-rs/src/script.rs
@ -0,0 +1,78 @@
+use std::collections::{HashMap, HashSet};
+
+use regex::Regex;
+use serde::Serialize;
+
+use crate::Error;
+
+/// Minimal `direct_url.json` schema.
+///
+/// Only the `archive_info` and `url` keys are modeled; the struct is serialized through its
+/// `Serialize` derive.
+///
+/// <https://packaging.python.org/en/latest/specifications/direct-url/>
+/// <https://www.python.org/dev/peps/pep-0610/>
+#[derive(Serialize)]
+struct DirectUrl {
+    // NOTE(review): with unit key and value types this map cannot hold any meaningful data;
+    // presumably it is always left empty so that `archive_info` serializes as an empty
+    // object — confirm at the construction site. The clippy lint is silenced because the
+    // zero-sized values are deliberate.
+    #[allow(clippy::zero_sized_map_values)]
+    archive_info: HashMap<(), ()>,
+    /// Value written under the `url` key.
+    url: String,
+}
+
+/// A script defining the name of the runnable entrypoint and the module and function that should be
+/// run.
+///
+/// This definition is compiled only when the `python_bindings` feature is enabled; it is also
+/// declared as a pyo3 class and exposes a getter for every field.
+#[cfg(feature = "python_bindings")]
+#[derive(Clone, Debug, Eq, PartialEq, Serialize)]
+#[pyo3::pyclass(dict)]
+pub struct Script {
+    /// Name of the entrypoint, as passed to [`Script::from_value`].
+    #[pyo3(get)]
+    pub script_name: String,
+    /// Module part of the definition (the text before the `:`).
+    #[pyo3(get)]
+    pub module: String,
+    /// Function part of the definition (the object reference after the `:`, without extras).
+    #[pyo3(get)]
+    pub function: String,
+}
+
+/// A script defining the name of the runnable entrypoint and the module and function that should be
+/// run.
+///
+/// This definition is compiled only when the `python_bindings` feature is disabled.
+#[cfg(not(feature = "python_bindings"))]
+#[derive(Clone, Debug, Eq, PartialEq, Serialize)]
+pub struct Script {
+    /// Name of the entrypoint, as passed to [`Script::from_value`].
+    pub script_name: String,
+    /// Module part of the definition (the text before the `:`).
+    pub module: String,
+    /// Function part of the definition (the object reference after the `:`, without extras).
+    pub function: String,
+}
+
+impl Script {
+    /// Parses a script definition like `foo.bar:main` or `foomod:main_bar [bar,baz]`.
+    ///
+    /// <https://packaging.python.org/en/latest/specifications/entry-points/>
+    ///
+    /// As the specification requires, spaces are accepted and ignored around the colon, between
+    /// the object reference and the opening square bracket, around the extra names, and after
+    /// the closing square bracket.
+    ///
+    /// Extras are supposed to be ignored, which happens if you pass `None` for `extras`. When
+    /// `extras` is `Some`, the script is skipped (`Ok(None)`) unless every extra named in the
+    /// definition is contained in `extras`.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`Error::InvalidWheel`] if `value` does not match the entry point syntax.
+    pub fn from_value(
+        script_name: &str,
+        value: &str,
+        extras: Option<&[String]>,
+    ) -> Result<Option<Script>, Error> {
+        // Note: the pattern is compiled on every call; callers parsing many entry points in a
+        // hot loop may want to cache it.
+        let script_regex = Regex::new(
+            r"^(?P<module>[\w\d_\-.]+)\s*:\s*(?P<function>[\w\d_\-.]+)(?:\s*\[\s*(?P<extras>(?:[^,]+,?\s*)+)\])?\s*$",
+        )
+        .expect("the entry point pattern is a valid regex");
+
+        let captures = script_regex
+            .captures(value)
+            .ok_or_else(|| Error::InvalidWheel(format!("invalid console script: '{value}'")))?;
+
+        // Only filter on extras when the definition names some AND the caller asked for
+        // filtering; otherwise the extras are ignored entirely.
+        if let (Some(script_extras), Some(extras)) = (captures.name("extras"), extras) {
+            // Borrow the requested extras instead of cloning each `String` into a new set.
+            let requested: HashSet<&str> = extras.iter().map(String::as_str).collect();
+            let all_requested = script_extras
+                .as_str()
+                .split(',')
+                .map(str::trim)
+                .all(|extra| requested.contains(extra));
+            if !all_requested {
+                return Ok(None);
+            }
+        }
+
+        // `module` and `function` are mandatory groups, so they are present on every match.
+        Ok(Some(Script {
+            script_name: script_name.to_string(),
+            module: captures["module"].to_string(),
+            function: captures["function"].to_string(),
+        }))
+    }
+}