Switch to msgpack in the cached client (#662)

This gives a 1.23× speedup on transformers-extras. We could switch to msgpack for the entire cache if we want. So far I've only tried this format and postcard, and postcard was much slower (around 1.6 s on the same benchmark).

I don't actually want to merge it like this; I mainly wanted to figure out the ballpark of the improvement from switching away from JSON.
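At its core, the change is just swapping `serde_json::to_vec`/`from_slice` for `rmp_serde::to_vec`/`from_slice`. As a rough illustration of why that helps, here's a minimal sketch comparing the two encodings; `CachedResponse` is a made-up stand-in for the real cache payload types, not code from this PR:

```rust
// Illustrative comparison only; `CachedResponse` is a hypothetical payload,
// not the `DataWithCachePolicy` type this PR actually serializes.
use serde::{Deserialize, Serialize};

#[derive(Debug, PartialEq, Serialize, Deserialize)]
struct CachedResponse {
    url: String,
    etag: Option<String>,
    filenames: Vec<String>,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let payload = CachedResponse {
        url: "https://pypi.org/simple/flask/".to_string(),
        etag: Some("\"abc123\"".to_string()),
        filenames: vec!["flask-3.0.0-py3-none-any.whl".to_string(); 100],
    };

    // JSON: text-based, repeats field names and escapes every string.
    let json = serde_json::to_vec(&payload)?;
    // MessagePack via rmp-serde: compact, length-prefixed binary.
    let msgpack = rmp_serde::to_vec(&payload)?;
    println!("json: {} bytes, msgpack: {} bytes", json.len(), msgpack.len());

    // The binary round-trip is lossless.
    let decoded: CachedResponse = rmp_serde::from_slice(&msgpack)?;
    assert_eq!(decoded, payload);
    Ok(())
}
```

With the default `rmp_serde::to_vec`, structs are encoded as MessagePack arrays, so field names aren't stored at all, which is where much of the size and parsing win comes from. The benchmark below measures the end-to-end effect on a warm cache: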

```
hyperfine --warmup 3 --runs 10 "target/profiling/puffin pip-compile --cache-dir cache-msgpack scripts/requirements/transformers-extras.in" "target/profiling/branch pip-compile scripts/requirements/transformers-extras.in"
Benchmark 1: target/profiling/puffin pip-compile --cache-dir cache-msgpack scripts/requirements/transformers-extras.in
  Time (mean ± σ):     179.1 ms ±   4.8 ms    [User: 157.5 ms, System: 48.1 ms]
  Range (min … max):   174.9 ms … 188.1 ms    10 runs

Benchmark 2: target/profiling/branch pip-compile scripts/requirements/transformers-extras.in
  Time (mean ± σ):     221.1 ms ±   6.7 ms    [User: 208.1 ms, System: 46.5 ms]
  Range (min … max):   213.5 ms … 235.5 ms    10 runs

Summary
  target/profiling/puffin pip-compile --cache-dir cache-msgpack scripts/requirements/transformers-extras.in ran
    1.23 ± 0.05 times faster than target/profiling/branch pip-compile scripts/requirements/transformers-extras.in
```

Disadvantage: we can no longer look into the cache by hand to debug things. (A possible workaround is sketched below.)
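One possible mitigation, not implemented in this PR: MessagePack is still self-describing, so a tiny helper can dump an entry as JSON for inspection. This sketch assumes the payload contains no raw binary fields or non-string map keys (which `serde_json::Value` can't represent); since entries are written with the default array encoding, the dump shows values without field names:

```rust
// Hypothetical debugging helper, not part of this PR: dump a MessagePack
// cache entry as pretty-printed JSON.
use std::{env, fs};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Usage: dump-cache <path/to/entry.msgpack>
    let path = env::args().nth(1).expect("expected a cache file path");
    let bytes = fs::read(path)?;
    // `serde_json::Value` can absorb any self-describing input; structs
    // serialized with `rmp_serde::to_vec` come out as JSON arrays.
    let value: serde_json::Value = rmp_serde::from_slice(&bytes)?;
    println!("{}", serde_json::to_string_pretty(&value)?);
    Ok(())
}
```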

- [ ] Check more formats; I've only tested JSON, msgpack, and postcard so far, and there should be other formats, too
- [x] Switch over `CachedByTimestamp` serialization (for the interpreter caching)
- [x] Switch over error handling and make sure puffin is still resilient to cache failure
Commit 71964ec7a8 (parent e4673a0c52), authored by konsti on 2023-12-16 22:01:35 +01:00 and committed via GitHub.

12 changed files with 117 additions and 66 deletions

Cargo.lock (generated):

```diff
@@ -1971,6 +1971,12 @@ dependencies = [
  "windows-targets 0.48.5",
 ]
 
+[[package]]
+name = "paste"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
+
 [[package]]
 name = "pep440_rs"
 version = "0.3.12"
@@ -2373,6 +2379,7 @@ dependencies = [
  "reqwest",
  "reqwest-middleware",
  "reqwest-retry",
+ "rmp-serde",
  "serde",
  "serde_json",
  "sha2",
@@ -2474,9 +2481,9 @@ dependencies = [
  "puffin-traits",
  "pypi-types",
  "reqwest",
+ "rmp-serde",
  "rustc-hash",
  "serde",
- "serde_json",
  "tempfile",
  "thiserror",
  "tokio",
@@ -2571,6 +2578,7 @@ dependencies = [
  "platform-host",
  "puffin-cache",
  "puffin-fs",
+ "rmp-serde",
  "serde",
  "serde_json",
  "tempfile",
@@ -3091,6 +3099,28 @@ dependencies = [
  "windows-sys 0.48.0",
 ]
 
+[[package]]
+name = "rmp"
+version = "0.8.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f9860a6cc38ed1da53456442089b4dfa35e7cedaa326df63017af88385e6b20"
+dependencies = [
+ "byteorder",
+ "num-traits",
+ "paste",
+]
+
+[[package]]
+name = "rmp-serde"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bffea85eea980d8a74453e5d02a8d93028f3c34725de143085a844ebe953258a"
+dependencies = [
+ "byteorder",
+ "rmp",
+ "serde",
+]
+
 [[package]]
 name = "rustc-demangle"
 version = "0.1.23"
```

```diff
@@ -64,6 +64,7 @@ reqwest = { version = "0.11.22", default-features = false, features = ["json", "
 reqwest-middleware = { version = "0.2.4" }
 reqwest-retry = { version = "0.3.0" }
 rfc2047-decoder = { version = "1.0.1" }
+rmp-serde = { version = "1.1.2" }
 rustc-hash = { version = "1.1.0" }
 seahash = { version = "4.1.0" }
 serde = { version = "1.0.190" }
```

```diff
@@ -173,15 +173,15 @@ impl Cache {
 pub enum CacheBucket {
     /// Wheels (excluding built wheels), alongside their metadata and cache policy.
     ///
-    /// There are three kinds from cache entries: Wheel metadata and policy as JSON files, the
+    /// There are three kinds from cache entries: Wheel metadata and policy as MsgPack files, the
     /// wheels themselves, and the unzipped wheel archives. If a wheel file is over an in-memory
     /// size threshold, we first download the zip file into the cache, then unzip it into a
     /// directory with the same name (exclusive of the `.whl` extension).
     ///
     /// Cache structure:
-    /// * `wheel-metadata-v0/pypi/foo/{foo-1.0.0-py3-none-any.json, foo-1.0.0-py3-none-any.whl}`
-    /// * `wheel-metadata-v0/<digest(index-url)>/foo/{foo-1.0.0-py3-none-any.json, foo-1.0.0-py3-none-any.whl}`
-    /// * `wheel-metadata-v0/url/<digest(url)>/foo/{foo-1.0.0-py3-none-any.json, foo-1.0.0-py3-none-any.whl}`
+    /// * `wheel-metadata-v0/pypi/foo/{foo-1.0.0-py3-none-any.msgpack, foo-1.0.0-py3-none-any.whl}`
+    /// * `wheel-metadata-v0/<digest(index-url)>/foo/{foo-1.0.0-py3-none-any.msgpack, foo-1.0.0-py3-none-any.whl}`
+    /// * `wheel-metadata-v0/url/<digest(url)>/foo/{foo-1.0.0-py3-none-any.msgpack, foo-1.0.0-py3-none-any.whl}`
     ///
     /// See `puffin_client::RegistryClient::wheel_metadata` for information on how wheel metadata
     /// is fetched.
@@ -203,12 +203,12 @@
     /// ├── pypi
     /// │   ...
     /// │   ├── pandas
-    /// │   │   └── pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.json
+    /// │   │   └── pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.msgpack
     /// │   ...
     /// └── url
     ///     └── 4b8be67c801a7ecb
     ///         └── flask
-    ///             └── flask-3.0.0-py3-none-any.json
+    ///             └── flask-3.0.0-py3-none-any.msgpack
     /// ```
     ///
     /// We get the following `requirement.txt` from `pip-compile`:
@@ -250,7 +250,7 @@
     /// ├── pypi
     /// │   ├── ...
     /// │   ├── pandas
-    /// │   │   ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.json
+    /// │   │   ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.msgpack
     /// │   │   ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
     /// │   │   └── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64
     /// │   │       ├── pandas
@@ -262,8 +262,8 @@
     /// └── url
     ///     └── 4b8be67c801a7ecb
     ///         └── flask
-    ///             ├── flask-3.0.0-py3-none-any.json
-    ///             ├── flask-3.0.0-py3-none-any.json
+    ///             ├── flask-3.0.0-py3-none-any.msgpack
+    ///             ├── flask-3.0.0-py3-none-any.msgpack
     ///             └── flask-3.0.0-py3-none-any
     ///                 ├── flask
     ///                 │   └── ...
@@ -287,15 +287,15 @@
     /// directories in the cache.
     ///
     /// Cache structure:
-    /// * `built-wheels-v0/pypi/foo/34a17436ed1e9669/{metadata.json, foo-1.0.0.zip, foo-1.0.0-py3-none-any.whl, ...other wheels}`
-    /// * `built-wheels-v0/<digest(index-url)>/foo/foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}`
-    /// * `built-wheels-v0/url/<digest(url)>/foo/foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}`
-    /// * `built-wheels-v0/git/<digest(url)>/<git sha>/foo/foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}`
+    /// * `built-wheels-v0/pypi/foo/34a17436ed1e9669/{metadata.msgpack, foo-1.0.0.zip, foo-1.0.0-py3-none-any.whl, ...other wheels}`
+    /// * `built-wheels-v0/<digest(index-url)>/foo/foo-1.0.0.zip/{metadata.msgpack, foo-1.0.0-py3-none-any.whl, ...other wheels}`
+    /// * `built-wheels-v0/url/<digest(url)>/foo/foo-1.0.0.zip/{metadata.msgpack, foo-1.0.0-py3-none-any.whl, ...other wheels}`
+    /// * `built-wheels-v0/git/<digest(url)>/<git sha>/foo/foo-1.0.0.zip/{metadata.msgpack, foo-1.0.0-py3-none-any.whl, ...other wheels}`
     ///
     /// But the url filename does not need to be a valid source dist filename
     /// (<https://github.com/search?q=path%3A**%2Frequirements.txt+master.zip&type=code>),
     /// so it could also be the following and we have to take any string as filename:
-    /// * `built-wheels-v0/url/<sha256(url)>/master.zip/metadata.json`
+    /// * `built-wheels-v0/url/<sha256(url)>/master.zip/metadata.msgpack`
     ///
     /// # Example
     ///
@@ -315,22 +315,22 @@
     /// ├── git
     /// │   └── a67db8ed076e3814
     /// │       └── 843b753e9e8cb74e83cac55598719b39a4d5ef1f
-    /// │           ├── metadata.json
+    /// │           ├── metadata.msgpack
    /// │           └── pydantic_extra_types-2.1.0-py3-none-any.whl
     /// ├── pypi
     /// │   └── django
     /// │       └── django-allauth-0.51.0.tar.gz
     /// │           ├── django_allauth-0.51.0-py3-none-any.whl
-    /// │           └── metadata.json
+    /// │           └── metadata.msgpack
     /// └── url
     ///     └── 6781bd6440ae72c2
     ///         └── werkzeug
     ///             └── werkzeug-3.0.1.tar.gz
-    ///                 ├── metadata.json
+    ///                 ├── metadata.msgpack
     ///                 └── werkzeug-3.0.1-py3-none-any.whl
     /// ```
     ///
-    /// The inside of a `metadata.json`:
+    /// Structurally, the inside of a `metadata.msgpack` looks like:
     /// ```json
     /// {
     ///   "data": {
@@ -352,11 +352,11 @@
     /// without the shim itself changing, we only cache when the path equals `sys.executable`, i.e.
     /// the path we're running is the python executable itself and not a shim.
     ///
-    /// Cache structure: `interpreter-v0/<digest(path)>.json`
+    /// Cache structure: `interpreter-v0/<digest(path)>.msgpack`
     ///
     /// # Example
     ///
-    /// The contents of each of the json files has a timestamp field in unix time, the [PEP 508]
+    /// The contents of each of the MsgPack files has a timestamp field in unix time, the [PEP 508]
     /// markers and some information from the `sys`/`sysconfig` modules.
     ///
     /// ```json
@@ -388,8 +388,8 @@
     /// Index responses through the simple metadata API.
     ///
     /// Cache structure:
-    /// * `simple-v0/pypi/<package_name>.json`
-    /// * `simple-v0/<digest(index_url)>/<package_name>.json`
+    /// * `simple-v0/pypi/<package_name>.msgpack`
+    /// * `simple-v0/<digest(index_url)>/<package_name>.msgpack`
     ///
     /// The response is parsed into [`puffin_client::SimpleMetadata`] before storage.
     Simple,
@@ -492,17 +492,17 @@ impl CacheBucket {
             }
         }
         CacheBucket::Simple => {
-            // For `pypi` wheels, we expect a JSON file per package, indexed by name.
+            // For `pypi` wheels, we expect a MsgPack file per package, indexed by name.
             let root = cache.bucket(self).join(WheelCacheKind::Pypi);
-            if remove(root.join(format!("{name}.json")))? {
+            if remove(root.join(format!("{name}.msgpack")))? {
                 count += 1;
             }
 
             // For alternate indices, we expect a directory for every index, followed by a
-            // JSON file per package, indexed by name.
+            // MsgPack file per package, indexed by name.
             let root = cache.bucket(self).join(WheelCacheKind::Url);
             for directory in directories(root) {
-                if remove(directory.join(format!("{name}.json")))? {
+                if remove(directory.join(format!("{name}.msgpack")))? {
                     count += 1;
                 }
             }
```

```diff
@@ -15,18 +15,19 @@ pypi-types = { path = "../pypi-types" }
 async_http_range_reader = { workspace = true }
 async_zip = { workspace = true, features = ["tokio"] }
-futures = { workspace = true }
 fs-err = { workspace = true, features = ["tokio"] }
+futures = { workspace = true }
 http = { workspace = true }
 http-cache-semantics = { workspace = true }
 reqwest = { workspace = true }
 reqwest-middleware = { workspace = true }
 reqwest-retry = { workspace = true }
+rmp-serde = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
 sha2 = { workspace = true }
-thiserror = { workspace = true }
 tempfile = { workspace = true }
+thiserror = { workspace = true }
 tokio = { workspace = true, features = ["fs"] }
 tokio-util = { workspace = true }
 tracing = { workspace = true }
```

```diff
@@ -101,7 +101,7 @@ impl CachedClient {
         CallbackReturn: Future<Output = Result<Payload, CallBackError>>,
     {
         let cached = if let Ok(cached) = fs_err::tokio::read(cache_entry.path()).await {
-            match serde_json::from_slice::<DataWithCachePolicy<Payload>>(&cached) {
+            match rmp_serde::from_slice::<DataWithCachePolicy<Payload>>(&cached) {
                 Ok(data) => Some(data),
                 Err(err) => {
                     warn!(
@@ -123,7 +123,7 @@
             CachedResponse::NotModified(data_with_cache_policy) => {
                 write_atomic(
                     cache_entry.path(),
-                    serde_json::to_vec(&data_with_cache_policy).map_err(crate::Error::from)?,
+                    rmp_serde::to_vec(&data_with_cache_policy).map_err(crate::Error::from)?,
                 )
                 .await
                 .map_err(crate::Error::CacheWrite)?;
@@ -139,7 +139,7 @@
                     .await
                     .map_err(crate::Error::CacheWrite)?;
                 let data =
-                    serde_json::to_vec(&data_with_cache_policy).map_err(crate::Error::from)?;
+                    rmp_serde::to_vec(&data_with_cache_policy).map_err(crate::Error::from)?;
                 write_atomic(cache_entry.path(), data)
                     .await
                     .map_err(crate::Error::CacheWrite)?;
```
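For reference, the shape of the read/write path the cached client ends up with, reduced to a sketch: `read_cache` and `write_cache` are illustrative helper names rather than the actual `CachedClient` API, and the real code logs via `tracing` and writes atomically:

```rust
// A sketch of the cache round-trip pattern used above, under assumed
// helper names; the real payload type is `DataWithCachePolicy<Payload>`.
use std::path::Path;

use serde::{de::DeserializeOwned, Serialize};

fn read_cache<T: DeserializeOwned>(path: &Path) -> Option<T> {
    let bytes = std::fs::read(path).ok()?;
    match rmp_serde::from_slice::<T>(&bytes) {
        Ok(data) => Some(data),
        // Resilience to cache failure: a corrupt or old-format entry is
        // treated as a cache miss rather than a hard error.
        Err(err) => {
            eprintln!("ignoring unreadable cache entry {}: {err}", path.display());
            None
        }
    }
}

fn write_cache<T: Serialize>(path: &Path, data: &T) -> Result<(), Box<dyn std::error::Error>> {
    // The real code uses `write_atomic`; a plain write keeps this sketch
    // self-contained.
    std::fs::write(path, rmp_serde::to_vec(data)?)?;
    Ok(())
}
```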

```diff
@@ -73,12 +73,15 @@
     #[error(transparent)]
     Io(#[from] io::Error),
 
+    #[error("Cache deserialization failed")]
+    Decode(#[from] rmp_serde::decode::Error),
+    #[error("Cache serialization failed")]
+    Encode(#[from] rmp_serde::encode::Error),
+
     /// An [`io::Error`] with a filename attached
     #[error(transparent)]
     Persist(#[from] tempfile::PersistError),
-
-    #[error("Failed to serialize response to cache")]
-    SerdeJson(#[from] serde_json::Error),
 }
 
 impl Error {
```

```diff
@@ -145,7 +145,7 @@ impl RegistryClient {
                 IndexUrl::Pypi => "pypi".to_string(),
                 IndexUrl::Url(url) => digest(&CanonicalUrl::new(url)),
             }),
-            format!("{}.json", package_name.as_ref()),
+            format!("{}.msgpack", package_name.as_ref()),
         );
 
         let simple_request = self
@@ -246,7 +246,7 @@
         let cache_entry = self.cache.entry(
             CacheBucket::Wheels,
             WheelCache::Index(&index).remote_wheel_dir(filename.name.as_ref()),
-            format!("{}.json", filename.stem()),
+            format!("{}.msgpack", filename.stem()),
         );
 
         let response_callback = |response: Response| async {
@@ -281,7 +281,7 @@
         let cache_entry = self.cache.entry(
             CacheBucket::Wheels,
             cache_shard.remote_wheel_dir(filename.name.as_ref()),
-            format!("{}.json", filename.stem()),
+            format!("{}.msgpack", filename.stem()),
         );
 
         // This response callback is special, we actually make a number of subsequent requests to
```

```diff
@@ -34,9 +34,9 @@ fs-err = { workspace = true }
 fs2 = { workspace = true }
 futures = { workspace = true }
 reqwest = { workspace = true }
+rmp-serde = { workspace = true }
 rustc-hash = { workspace = true }
 serde = { workspace = true , features = ["derive"] }
-serde_json = { workspace = true }
 tempfile = { workspace = true }
 thiserror = { workspace = true }
 tokio = { workspace = true }
```

```diff
@@ -56,8 +56,10 @@ pub enum SourceDistError {
     // Cache writing error
     #[error("Failed to write to source dist cache")]
     Io(#[from] std::io::Error),
-    #[error("Cache (de)serialization failed")]
-    Serde(#[from] serde_json::Error),
+    #[error("Cache deserialization failed")]
+    Decode(#[from] rmp_serde::decode::Error),
+    #[error("Cache serialization failed")]
+    Encode(#[from] rmp_serde::encode::Error),
 
     // Build error
     #[error("Failed to build: {0}")]
@@ -179,7 +181,8 @@ pub struct SourceDistCachedBuilder<'a, T: BuildContext> {
     tags: &'a Tags,
 }
 
-const METADATA_JSON: &str = "metadata.json";
+/// The name of the file that contains the cached metadata, encoded via `MsgPack`.
+const METADATA: &str = "metadata.msgpack";
 
 impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
     /// Initialize a [`SourceDistCachedBuilder`] from a [`BuildContext`].
@@ -268,7 +271,7 @@
         cache_shard: &CacheShard,
         subdirectory: Option<&'data Path>,
     ) -> Result<BuiltWheelMetadata, SourceDistError> {
-        let cache_entry = cache_shard.entry(METADATA_JSON.to_string());
+        let cache_entry = cache_shard.entry(METADATA.to_string());
 
         let response_callback = |response| async {
             // At this point, we're seeing a new or updated source distribution; delete all
@@ -368,12 +371,12 @@
         if let Ok(cached) = fs::read(cache_entry.path()).await {
             // If the file exists and it was just read or written by `CachedClient`, we assume it must
             // be correct.
-            let mut cached = serde_json::from_slice::<DataWithCachePolicy<Manifest>>(&cached)?;
+            let mut cached = rmp_serde::from_slice::<DataWithCachePolicy<Manifest>>(&cached)?;
             cached
                 .data
                 .insert(wheel_filename.clone(), cached_data.clone());
-            write_atomic(cache_entry.path(), serde_json::to_vec(&cached)?).await?;
+            write_atomic(cache_entry.path(), rmp_serde::to_vec(&cached)?).await?;
         };
 
         Ok(BuiltWheelMetadata::from_cached(
@@ -393,7 +396,7 @@
             CacheBucket::BuiltWheels,
             WheelCache::Path(&path_source_dist.url)
                 .remote_wheel_dir(path_source_dist.name().as_ref()),
-            METADATA_JSON.to_string(),
+            METADATA.to_string(),
         );
 
         // Determine the last-modified time of the source distribution.
@@ -464,7 +467,7 @@
             timestamp: modified,
             data: manifest,
         };
-        let data = serde_json::to_vec(&cached)?;
+        let data = rmp_serde::to_vec(&cached)?;
         write_atomic(cache_entry.path(), data).await?;
 
         if let Some(task) = task {
@@ -498,7 +501,7 @@
             CacheBucket::BuiltWheels,
             WheelCache::Git(&git_source_dist.url, &git_sha.to_short_string())
                 .remote_wheel_dir(git_source_dist.name().as_ref()),
-            METADATA_JSON.to_string(),
+            METADATA.to_string(),
         );
 
         // Read the existing metadata from the cache.
@@ -540,7 +543,7 @@
                 metadata: metadata.clone(),
             },
         );
-        let data = serde_json::to_vec(&manifest)?;
+        let data = rmp_serde::to_vec(&manifest)?;
         write_atomic(cache_entry.path(), data).await?;
 
         if let Some(task) = task {
@@ -707,7 +710,7 @@
     ) -> Result<Option<Manifest>, SourceDistError> {
         match fs::read(&cache_entry.path()).await {
             Ok(cached) => {
-                let cached = serde_json::from_slice::<CachedByTimestamp<Manifest>>(&cached)?;
+                let cached = rmp_serde::from_slice::<CachedByTimestamp<Manifest>>(&cached)?;
                 if cached.timestamp == modified {
                     Ok(Some(cached.data))
                 } else {
@@ -729,7 +732,7 @@
     /// Read an existing cache entry, if it exists.
     async fn read_metadata(cache_entry: &CacheEntry) -> Result<Option<Manifest>, SourceDistError> {
         match fs::read(&cache_entry.path()).await {
-            Ok(cached) => Ok(Some(serde_json::from_slice::<Manifest>(&cached)?)),
+            Ok(cached) => Ok(Some(rmp_serde::from_slice::<Manifest>(&cached)?)),
             Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(None),
             Err(err) => Err(err.into()),
         }
```

```diff
@@ -20,11 +20,12 @@ puffin-cache = { path = "../puffin-cache" }
 puffin-fs = { path = "../puffin-fs" }
 
 fs-err = { workspace = true, features = ["tokio"] }
+rmp-serde = { workspace = true }
+serde = { workspace = true, features = ["derive"] }
 serde_json = { workspace = true }
 thiserror = { workspace = true }
 tokio = { workspace = true }
 tracing = { workspace = true }
-serde = { workspace = true, features = ["derive"] }
 
 [dev-dependencies]
 indoc = { version = "2.0.4" }
```

```diff
@@ -3,7 +3,7 @@ use std::process::Command;
 
 use fs_err as fs;
 use serde::{Deserialize, Serialize};
-use tracing::debug;
+use tracing::{debug, warn};
 
 use pep440_rs::Version;
 use pep508_rs::MarkerEnvironment;
@@ -133,9 +133,8 @@
         let data = serde_json::from_slice::<Self>(&output.stdout).map_err(|err| {
             Error::PythonSubcommandOutput {
                 message: format!(
-                    "Querying python at {} did not return the expected data: {}",
+                    "Querying python at {} did not return the expected data: {err}",
                     interpreter.display(),
-                    err,
                 ),
                 stdout: String::from_utf8_lossy(&output.stdout).trim().to_string(),
                 stderr: String::from_utf8_lossy(&output.stderr).trim().to_string(),
@@ -155,7 +154,7 @@
         let cache_entry = cache.entry(
             CacheBucket::Interpreter,
             "",
-            format!("{}.json", digest(&executable_bytes)),
+            format!("{}.msgpack", digest(&executable_bytes)),
         );
 
         // `modified()` is infallible on windows and unix (i.e., all platforms we support).
@@ -163,16 +162,25 @@
         // Read from the cache.
         if let Ok(data) = fs::read(cache_entry.path()) {
-            if let Ok(cached) = serde_json::from_slice::<CachedByTimestamp<Self>>(&data) {
-                if cached.timestamp == modified {
-                    debug!("Using cached markers for: {}", executable.display());
-                    return Ok(cached.data);
-                }
-
-                debug!(
-                    "Ignoring stale cached markers for: {}",
-                    executable.display()
-                );
+            match rmp_serde::from_slice::<CachedByTimestamp<Self>>(&data) {
+                Ok(cached) => {
+                    if cached.timestamp == modified {
+                        debug!("Using cached markers for: {}", executable.display());
+                        return Ok(cached.data);
+                    }
+
+                    debug!(
+                        "Ignoring stale cached markers for: {}",
+                        executable.display()
+                    );
+                }
+                Err(err) => {
+                    warn!(
+                        "Broken cache entry at {}, removing: {err}",
+                        cache_entry.path().display()
+                    );
+                    let _ = fs_err::remove_file(cache_entry.path());
+                }
             }
         }
@@ -187,7 +195,7 @@
         // Write to the cache.
         write_atomic_sync(
             cache_entry.path(),
-            serde_json::to_vec(&CachedByTimestamp {
+            rmp_serde::to_vec(&CachedByTimestamp {
                 timestamp: modified,
                 data: info.clone(),
             })?,
```
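The interpreter change above is an instance of the timestamp-guard pattern: trust a cache entry only while the interpreter binary's mtime matches, and delete entries that fail to decode. Reduced to a standalone sketch (this `CachedByTimestamp` definition is assumed for illustration; the real one lives in `puffin-cache`):

```rust
use std::path::Path;
use std::time::SystemTime;

use serde::{de::DeserializeOwned, Deserialize, Serialize};

// Assumed mirror of the crate's cache wrapper: a payload tagged with the
// source file's modification time at the moment it was cached.
#[derive(Serialize, Deserialize)]
struct CachedByTimestamp<T> {
    timestamp: SystemTime,
    data: T,
}

/// Return the cached payload only if `source_path` is unchanged since the
/// entry was written; drop broken entries so the next run starts clean.
fn read_fresh<T: DeserializeOwned>(cache_path: &Path, source_path: &Path) -> Option<T> {
    let modified = source_path.metadata().ok()?.modified().ok()?;
    let bytes = std::fs::read(cache_path).ok()?;
    match rmp_serde::from_slice::<CachedByTimestamp<T>>(&bytes) {
        Ok(cached) if cached.timestamp == modified => Some(cached.data),
        // Stale: the source was modified after the entry was written.
        Ok(_) => None,
        Err(_) => {
            // Broken cache entry (e.g., written in an older format).
            let _ = std::fs::remove_file(cache_path);
            None
        }
    }
}
```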

```diff
@@ -40,6 +40,10 @@
     },
     #[error("Failed to write to cache")]
     Serde(#[from] serde_json::Error),
+    #[error("Cache deserialization failed")]
+    Decode(#[from] rmp_serde::decode::Error),
+    #[error("Cache serialization failed")]
+    Encode(#[from] rmp_serde::encode::Error),
     #[error("Failed to parse pyvenv.cfg")]
     Cfg(#[from] cfg::Error),
 }
```
} }