Improve path source dist caching (#578)

Path distribution cache reading errors are no longer fatal.

We now invalidate path file source dists if their modification
timestamp changed, and invalidate path dir source dists if
`pyproject.toml` or, alternatively, `setup.py` changed. These seem like good
choices, since changing `pyproject.toml` should trigger a rebuild and the
user can `touch` the file as part of their workflow.

`CachedByTimestamp` is now a shared util. It doesn't have methods, as I
don't think that's worth it yet for only two users.

Closes #478

TODO(konstin): Write a test. This is probably twice as much work as the
fix itself, so I made this PR without one for now.
This commit is contained in:
konsti 2023-12-06 17:47:01 +01:00 committed by GitHub
parent 1bf754556f
commit 3f4d7b7826
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 90 additions and 36 deletions

View file

@ -22,5 +22,7 @@ directories = { workspace = true }
fs-err = { workspace = true, features = ["tokio"] }
hex = { workspace = true }
seahash = { workspace = true }
serde = { workspace = true, features = ["derive"] }
serde_json = { workspace = true }
tempfile = { workspace = true }
url = { workspace = true }

View file

@ -0,0 +1,9 @@
use std::time::SystemTime;
use serde::{Deserialize, Serialize};
/// A cached payload paired with the modification timestamp of the source it
/// was derived from.
///
/// Used to invalidate cache entries: on read, the stored `timestamp` is
/// compared against the current `modified()` time of the source file, and
/// the cached `data` is discarded on mismatch.
///
/// Deliberately a plain data holder without methods; the comparison logic
/// lives at the call sites.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct CachedByTimestamp<T> {
    /// Modification time (`fs::Metadata::modified()`) of the source at the
    /// time `data` was computed.
    pub timestamp: SystemTime,
    /// The cached payload.
    pub data: T,
}

View file

@ -7,6 +7,7 @@ use std::sync::Arc;
use fs_err as fs;
use tempfile::{tempdir, TempDir};
pub use by_timestamp::CachedByTimestamp;
pub use canonical_url::{CanonicalUrl, RepositoryUrl};
#[cfg(feature = "clap")]
pub use cli::CacheArgs;
@ -14,6 +15,7 @@ pub use digest::digest;
pub use stable_hash::{StableHash, StableHasher};
pub use wheel::WheelCache;
mod by_timestamp;
mod cache_key;
mod canonical_url;
mod cli;

View file

@ -14,7 +14,7 @@ use tempfile::TempDir;
use thiserror::Error;
use tokio::task::JoinError;
use tokio_util::compat::FuturesAsyncReadCompatExt;
use tracing::{debug, info_span};
use tracing::{debug, info_span, warn};
use url::Url;
use zip::result::ZipError;
use zip::ZipArchive;
@ -24,7 +24,7 @@ use distribution_types::direct_url::{DirectArchiveUrl, DirectGitUrl};
use distribution_types::{GitSourceDist, Metadata, PathSourceDist, RemoteSource, SourceDist};
use install_wheel_rs::read_dist_info;
use platform_tags::Tags;
use puffin_cache::{digest, CacheBucket, CacheEntry, CanonicalUrl, WheelCache};
use puffin_cache::{digest, CacheBucket, CacheEntry, CachedByTimestamp, CanonicalUrl, WheelCache};
use puffin_client::{CachedClient, CachedClientError, DataWithCachePolicy};
use puffin_fs::write_atomic;
use puffin_git::{Fetch, GitSource};
@ -73,6 +73,8 @@ pub enum SourceDistError {
DistInfo(#[from] install_wheel_rs::Error),
#[error("Failed to read zip archive from built wheel")]
Zip(#[from] ZipError),
#[error("Source distribution directory contains neither readable pyproject.toml nor setup.py")]
DirWithoutEntrypoint,
/// Should not occur; only seen when another task panicked.
#[error("The task executor is broken, did some other task panic?")]
@ -350,21 +352,60 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
METADATA_JSON.to_string(),
);
let mut metadatas = if cache_entry.path().is_file() {
let cached = fs::read(&cache_entry.path()).await?;
let metadatas = serde_json::from_slice::<Metadata21s>(&cached)?;
// Do we have previous compatible build of this source dist?
if let Some((filename, cached_data)) = metadatas
.iter()
.find(|(filename, _metadata)| filename.is_compatible(self.tags))
{
return Ok(BuiltWheelMetadata::from_cached(
filename,
cached_data,
&cache_entry,
));
let file_metadata = fs_err::metadata(&path_source_dist.path)?;
// `modified()` is infallible on windows and unix (i.e., all platforms we support).
let modified = if file_metadata.is_file() {
file_metadata.modified()?
} else {
if path_source_dist.path.join("pyproject.toml").is_file() {
path_source_dist
.path
.join("pyproject.toml")
.metadata()?
.modified()?
} else if path_source_dist.path.join("setup.py").is_file() {
path_source_dist
.path
.join("setup.py")
.metadata()?
.modified()?
} else {
return Err(SourceDistError::DirWithoutEntrypoint);
}
};
let mut metadatas = if cache_entry.path().is_file() {
let cached = fs::read(&cache_entry.path()).await.ok().and_then(|cached| {
serde_json::from_slice::<CachedByTimestamp<Metadata21s>>(&cached).ok()
});
if let Some(cached) = cached {
if cached.timestamp == modified {
// Do we have previous compatible build of this source dist?
if let Some((filename, cached_data)) = cached
.data
.iter()
.find(|(filename, _metadata)| filename.is_compatible(self.tags))
{
return Ok(BuiltWheelMetadata::from_cached(
filename,
cached_data,
&cache_entry,
));
}
cached.data
} else {
debug!(
"Removing stale built wheels for: {}",
cache_entry.path().display()
);
if let Err(err) = fs::remove_dir_all(&cache_entry.dir).await {
warn!("Failed to remove stale built wheel cache directory: {err}");
}
Metadata21s::default()
}
} else {
Metadata21s::default()
}
metadatas
} else {
Metadata21s::default()
};
@ -399,8 +440,12 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
metadata: metadata.clone(),
},
);
let cached = CachedByTimestamp {
timestamp: modified,
data: metadatas,
};
fs::create_dir_all(&cache_entry.dir).await?;
let data = serde_json::to_vec(&metadatas)?;
let data = serde_json::to_vec(&cached)?;
write_atomic(cache_entry.path(), data).await?;
if let Some(task) = task {

View file

@ -1,6 +1,5 @@
use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::UNIX_EPOCH;
use fs_err as fs;
use serde::{Deserialize, Serialize};
@ -9,6 +8,7 @@ use tracing::debug;
use pep440_rs::Version;
use pep508_rs::MarkerEnvironment;
use platform_host::Platform;
use puffin_cache::CachedByTimestamp;
use puffin_cache::{digest, Cache, CacheBucket};
use puffin_fs::write_atomic_sync;
@ -106,12 +106,6 @@ pub(crate) struct InterpreterQueryResult {
pub(crate) sys_executable: PathBuf,
}
#[derive(Deserialize, Serialize)]
pub(crate) struct CachedByTimestamp<T> {
pub(crate) timestamp: u128,
pub(crate) data: T,
}
impl InterpreterQueryResult {
/// Return the resolved [`InterpreterQueryResult`] for the given Python executable.
pub(crate) fn query(interpreter: &Path) -> Result<Self, Error> {
@ -158,19 +152,19 @@ impl InterpreterQueryResult {
/// time as a cache key.
pub(crate) fn query_cached(executable: &Path, cache: &Cache) -> Result<Self, Error> {
let executable_bytes = executable.as_os_str().as_encoded_bytes();
let cache_dir = cache.bucket(CacheBucket::Interpreter);
let cache_path = cache_dir.join(format!("{}.json", digest(&executable_bytes)));
let cache_entry = cache.entry(
CacheBucket::Interpreter,
"",
format!("{}.json", digest(&executable_bytes)),
);
let modified = fs_err::metadata(executable)?
// Note: This is infallible on windows and unix (i.e., all platforms we support).
.modified()?
.duration_since(UNIX_EPOCH)
.map_err(|err| Error::SystemTime(executable.to_path_buf(), err))?;
// `modified()` is infallible on windows and unix (i.e., all platforms we support).
let modified = fs_err::metadata(executable)?.modified()?;
// Read from the cache.
if let Ok(data) = fs::read(&cache_path) {
if let Ok(data) = fs::read(cache_entry.path()) {
if let Ok(cached) = serde_json::from_slice::<CachedByTimestamp<Self>>(&data) {
if cached.timestamp == modified.as_millis() {
if cached.timestamp == modified {
debug!("Using cached markers for: {}", executable.display());
return Ok(cached.data);
}
@ -189,12 +183,12 @@ impl InterpreterQueryResult {
// If `executable` is a pyenv shim, a bash script that redirects to the activated
// python executable at another path, we're not allowed to cache the interpreter info
if executable == info.sys_executable {
fs::create_dir_all(cache_dir)?;
fs::create_dir_all(&cache_entry.dir)?;
// Write to the cache.
write_atomic_sync(
cache_path,
cache_entry.path(),
serde_json::to_vec(&CachedByTimestamp {
timestamp: modified.as_millis(),
timestamp: modified,
data: info.clone(),
})?,
)?;