Switch to msgpack in the cached client (#662)

This gives a 1.23x speedup on transformers-extras. We could switch the entire cache to msgpack if we want. So far I have only tried this format and postcard, and postcard was much slower (around 1.6s).

I don't actually want to merge it like this; I wanted to figure out the ballpark of the improvement from switching away from JSON.
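
The change itself is mechanical: wherever the cached client wrote or read JSON, it now goes through `rmp_serde`. A minimal round-trip sketch of the idea; the `CachedPayload` struct here is just a hypothetical stand-in for the real cached types (`Manifest`, `DataWithCachePolicy`) touched in the diff below:

```rust
use serde::{Deserialize, Serialize};

// Hypothetical stand-in for the payloads the cached client actually stores.
#[derive(Debug, PartialEq, Serialize, Deserialize)]
struct CachedPayload {
    name: String,
    version: String,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let payload = CachedPayload {
        name: "transformers".to_string(),
        version: "0.0.0".to_string(),
    };

    // Before: serde_json::to_vec / serde_json::from_slice.
    // After: the msgpack equivalents from rmp_serde.
    let bytes = rmp_serde::to_vec(&payload)?;
    let roundtrip: CachedPayload = rmp_serde::from_slice(&bytes)?;
    assert_eq!(payload, roundtrip);

    println!("{} bytes of msgpack", bytes.len());
    Ok(())
}
```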

```
hyperfine --warmup 3 --runs 10 "target/profiling/puffin pip-compile --cache-dir cache-msgpack scripts/requirements/transformers-extras.in" "target/profiling/branch pip-compile scripts/requirements/transformers-extras.in"
Benchmark 1: target/profiling/puffin pip-compile --cache-dir cache-msgpack scripts/requirements/transformers-extras.in
  Time (mean ± σ):     179.1 ms ±   4.8 ms    [User: 157.5 ms, System: 48.1 ms]
  Range (min … max):   174.9 ms … 188.1 ms    10 runs

Benchmark 2: target/profiling/branch pip-compile scripts/requirements/transformers-extras.in
  Time (mean ± σ):     221.1 ms ±   6.7 ms    [User: 208.1 ms, System: 46.5 ms]
  Range (min … max):   213.5 ms … 235.5 ms    10 runs

Summary
  target/profiling/puffin pip-compile --cache-dir cache-msgpack scripts/requirements/transformers-extras.in ran
    1.23 ± 0.05 times faster than target/profiling/branch pip-compile scripts/requirements/transformers-extras.in
```

Disadvantage: we can no longer manually look into the cache to debug things.
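
A possible mitigation (not part of this PR): a small one-off tool could decode a msgpack cache file back into generic JSON for inspection. A rough sketch, with the caveat that `rmp_serde::to_vec` encodes structs positionally, so the dump shows arrays rather than named objects:

```rust
use std::{env, fs};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Usage (hypothetical helper, not shipped anywhere):
    //   dump-cache <path/to/metadata.msgpack>
    let path = env::args().nth(1).expect("expected a cache file path");
    let bytes = fs::read(&path)?;
    // Decode into a generic JSON value. Maps with non-string keys or raw
    // binary would fail to convert, but the previously-JSON data should not
    // contain those.
    let value: serde_json::Value = rmp_serde::from_slice(&bytes)?;
    println!("{}", serde_json::to_string_pretty(&value)?);
    Ok(())
}
```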

- [ ] Check more formats; I have currently only tested JSON, msgpack, and postcard, and there should be other formats, too
- [x] Switch over `CachedByTimestamp` serialization (for the interpreter caching; see the sketch after this list)
- [x] Switch over error handling and make sure puffin is still resilient
to cache failure
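
For the `CachedByTimestamp` item, the wrapper goes through the same round trip. A sketch, assuming the wrapper is roughly a generic `{ timestamp, data }` pair keyed on the source's modification time; the exact definition and timestamp type live in the repo:

```rust
use serde::{Deserialize, Serialize};
use std::time::SystemTime;

// Assumed shape of the wrapper, based on its use in the diff below:
// the cached data plus the timestamp it was derived from.
#[derive(Serialize, Deserialize)]
struct CachedByTimestamp<T> {
    timestamp: SystemTime,
    data: T,
}

// Only use the cached data if the stored timestamp still matches the
// source's current modification time, now decoding msgpack instead of JSON.
fn read_fresh<T: serde::de::DeserializeOwned>(
    bytes: &[u8],
    modified: SystemTime,
) -> Result<Option<T>, rmp_serde::decode::Error> {
    let cached: CachedByTimestamp<T> = rmp_serde::from_slice(bytes)?;
    Ok((cached.timestamp == modified).then_some(cached.data))
}
```
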
Commit 71964ec7a8 (parent e4673a0c52), authored by konsti, committed by GitHub on 2023-12-16 22:01:35 +01:00.
12 changed files with 117 additions and 66 deletions

@@ -56,8 +56,10 @@ pub enum SourceDistError {
// Cache writing error
#[error("Failed to write to source dist cache")]
Io(#[from] std::io::Error),
#[error("Cache (de)serialization failed")]
Serde(#[from] serde_json::Error),
#[error("Cache deserialization failed")]
Decode(#[from] rmp_serde::decode::Error),
#[error("Cache serialization failed")]
Encode(#[from] rmp_serde::encode::Error),
// Build error
#[error("Failed to build: {0}")]
@@ -179,7 +181,8 @@ pub struct SourceDistCachedBuilder<'a, T: BuildContext> {
tags: &'a Tags,
}
- const METADATA_JSON: &str = "metadata.json";
+ /// The name of the file that contains the cached metadata, encoded via `MsgPack`.
+ const METADATA: &str = "metadata.msgpack";
impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
/// Initialize a [`SourceDistCachedBuilder`] from a [`BuildContext`].
@@ -268,7 +271,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
cache_shard: &CacheShard,
subdirectory: Option<&'data Path>,
) -> Result<BuiltWheelMetadata, SourceDistError> {
- let cache_entry = cache_shard.entry(METADATA_JSON.to_string());
+ let cache_entry = cache_shard.entry(METADATA.to_string());
let response_callback = |response| async {
// At this point, we're seeing a new or updated source distribution; delete all
@@ -368,12 +371,12 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
if let Ok(cached) = fs::read(cache_entry.path()).await {
// If the file exists and it was just read or written by `CachedClient`, we assume it must
// be correct.
- let mut cached = serde_json::from_slice::<DataWithCachePolicy<Manifest>>(&cached)?;
+ let mut cached = rmp_serde::from_slice::<DataWithCachePolicy<Manifest>>(&cached)?;
cached
.data
.insert(wheel_filename.clone(), cached_data.clone());
- write_atomic(cache_entry.path(), serde_json::to_vec(&cached)?).await?;
+ write_atomic(cache_entry.path(), rmp_serde::to_vec(&cached)?).await?;
};
Ok(BuiltWheelMetadata::from_cached(
@@ -393,7 +396,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
CacheBucket::BuiltWheels,
WheelCache::Path(&path_source_dist.url)
.remote_wheel_dir(path_source_dist.name().as_ref()),
- METADATA_JSON.to_string(),
+ METADATA.to_string(),
);
// Determine the last-modified time of the source distribution.
@@ -464,7 +467,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
timestamp: modified,
data: manifest,
};
- let data = serde_json::to_vec(&cached)?;
+ let data = rmp_serde::to_vec(&cached)?;
write_atomic(cache_entry.path(), data).await?;
if let Some(task) = task {
@@ -498,7 +501,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
CacheBucket::BuiltWheels,
WheelCache::Git(&git_source_dist.url, &git_sha.to_short_string())
.remote_wheel_dir(git_source_dist.name().as_ref()),
- METADATA_JSON.to_string(),
+ METADATA.to_string(),
);
// Read the existing metadata from the cache.
@@ -540,7 +543,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
metadata: metadata.clone(),
},
);
- let data = serde_json::to_vec(&manifest)?;
+ let data = rmp_serde::to_vec(&manifest)?;
write_atomic(cache_entry.path(), data).await?;
if let Some(task) = task {
@@ -707,7 +710,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
) -> Result<Option<Manifest>, SourceDistError> {
match fs::read(&cache_entry.path()).await {
Ok(cached) => {
- let cached = serde_json::from_slice::<CachedByTimestamp<Manifest>>(&cached)?;
+ let cached = rmp_serde::from_slice::<CachedByTimestamp<Manifest>>(&cached)?;
if cached.timestamp == modified {
Ok(Some(cached.data))
} else {
@@ -729,7 +732,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
/// Read an existing cache entry, if it exists.
async fn read_metadata(cache_entry: &CacheEntry) -> Result<Option<Manifest>, SourceDistError> {
match fs::read(&cache_entry.path()).await {
- Ok(cached) => Ok(Some(serde_json::from_slice::<Manifest>(&cached)?)),
+ Ok(cached) => Ok(Some(rmp_serde::from_slice::<Manifest>(&cached)?)),
Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(None),
Err(err) => Err(err.into()),
}