Migrate interpreter query to custom caching (#508)

This removes the last usage of cacache by replacing it with a custom,
flat json caching keyed by the digest of the executable path.


![image](8f777c4c-1f1b-4656-ba7b-002175270556)

A step towards #478. I've made `CachedByTimestamp<T>` generic over `T`
but intentionally not moved it to `puffin-cache` yet.
This commit is contained in:
konsti 2023-11-28 18:14:59 +01:00 committed by GitHub
parent 5435d44756
commit d89fbeb642
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 88 additions and 156 deletions

100
Cargo.lock generated
View file

@ -391,31 +391,6 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a3e368af43e418a04d52505cf3dbc23dda4e3407ae2fa99fd0e4f308ce546acc"
[[package]]
name = "cacache"
version = "12.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "142316461ed3a3dfcba10417317472da5bfd0461e4d276bf7c07b330766d9490"
dependencies = [
"digest",
"either",
"futures",
"hex",
"miette 5.10.0 (registry+https://github.com/rust-lang/crates.io-index)",
"reflink-copy",
"serde",
"serde_derive",
"serde_json",
"sha1",
"sha2",
"ssri",
"tempfile",
"thiserror",
"tokio",
"tokio-stream",
"walkdir",
]
[[package]]
name = "camino"
version = "1.1.6"
@ -1188,6 +1163,7 @@ dependencies = [
"clap",
"fs-err",
"platform-host",
"puffin-cache",
"puffin-interpreter",
"serde",
"serde_json",
@ -1753,18 +1729,6 @@ dependencies = [
"autocfg",
]
[[package]]
name = "miette"
version = "5.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59bb584eaeeab6bd0226ccf3509a69d7936d148cf3d036ad350abe35e8c6856e"
dependencies = [
"miette-derive 5.10.0 (registry+https://github.com/rust-lang/crates.io-index)",
"once_cell",
"thiserror",
"unicode-width",
]
[[package]]
name = "miette"
version = "5.10.0"
@ -1773,7 +1737,7 @@ dependencies = [
"backtrace",
"backtrace-ext",
"is-terminal",
"miette-derive 5.10.0 (git+https://github.com/zkat/miette.git?rev=fd77257cee0f5d03aa7dccb4ba8cbaa40c1a88c6)",
"miette-derive",
"once_cell",
"owo-colors",
"supports-color",
@ -1785,17 +1749,6 @@ dependencies = [
"unicode-width",
]
[[package]]
name = "miette-derive"
version = "5.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49e7bc1560b95a3c4a25d03de42fe76ca718ab92d1a22a55b9b4cf67b3ae635c"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.39",
]
[[package]]
name = "miette-derive"
version = "5.10.0"
@ -2338,7 +2291,6 @@ dependencies = [
"assert_cmd",
"assert_fs",
"bitflags 2.4.1",
"cacache",
"chrono",
"clap",
"colored",
@ -2351,7 +2303,7 @@ dependencies = [
"insta-cmd",
"install-wheel-rs",
"itertools 0.11.0",
"miette 5.10.0 (git+https://github.com/zkat/miette.git?rev=fd77257cee0f5d03aa7dccb4ba8cbaa40c1a88c6)",
"miette",
"mimalloc",
"pep440_rs 0.3.12",
"pep508_rs",
@ -2567,7 +2519,6 @@ dependencies = [
name = "puffin-interpreter"
version = "0.0.1"
dependencies = [
"cacache",
"fs-err",
"pep440_rs 0.3.12",
"pep508_rs",
@ -3262,28 +3213,6 @@ dependencies = [
"serde",
]
[[package]]
name = "sha-1"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f5058ada175748e33390e40e872bd0fe59a19f265d0158daa551c5a88a76009c"
dependencies = [
"cfg-if 1.0.0",
"cpufeatures",
"digest",
]
[[package]]
name = "sha1"
version = "0.10.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba"
dependencies = [
"cfg-if 1.0.0",
"cpufeatures",
"digest",
]
[[package]]
name = "sha2"
version = "0.10.8"
@ -3363,23 +3292,6 @@ version = "0.9.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
[[package]]
name = "ssri"
version = "9.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da7a2b3c2bc9693bcb40870c4e9b5bf0d79f9cb46273321bf855ec513e919082"
dependencies = [
"base64 0.21.5",
"digest",
"hex",
"miette 5.10.0 (registry+https://github.com/rust-lang/crates.io-index)",
"serde",
"sha-1",
"sha2",
"thiserror",
"xxhash-rust",
]
[[package]]
name = "stacker"
version = "0.1.15"
@ -4440,12 +4352,6 @@ dependencies = [
"libc",
]
[[package]]
name = "xxhash-rust"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9828b178da53440fa9c766a3d2f73f7cf5d0ac1fe3980c1e5018d899fd19e07b"
[[package]]
name = "yaml-rust"
version = "0.4.5"

View file

@ -18,7 +18,6 @@ async_http_range_reader = { git = "https://github.com/baszalmstra/async_http_ran
async_zip = { version = "0.0.15", features = ["tokio", "deflate"] }
bitflags = { version = "2.4.1" }
bytesize = { version = "1.3.0" }
cacache = { version = "12.0.0", default-features = false, features = ["tokio-runtime"] }
camino = { version = "1.1.6", features = ["serde1"] }
cargo-util = { version = "0.2.6" }
chrono = { version = "0.4.31" }

View file

@ -18,6 +18,7 @@ workspace = true
[dependencies]
platform-host = { path = "../platform-host" }
puffin-cache = { path = "../puffin-cache" }
puffin-interpreter = { path = "../puffin-interpreter" }
camino = { workspace = true }

View file

@ -11,6 +11,7 @@ use tracing_subscriber::{fmt, EnvFilter};
use gourgeist::{create_bare_venv, parse_python_cli};
use platform_host::Platform;
use puffin_cache::Cache;
use puffin_interpreter::Interpreter;
#[derive(Parser, Debug)]
@ -25,8 +26,8 @@ fn run() -> Result<(), gourgeist::Error> {
let location = cli.path.unwrap_or(Utf8PathBuf::from(".venv"));
let python = parse_python_cli(cli.python)?;
let platform = Platform::current()?;
let cache = tempfile::tempdir()?;
let info = Interpreter::query(python.as_std_path(), platform, cache.path()).unwrap();
let cache = Cache::temp()?;
let info = Interpreter::query(python.as_std_path(), platform, &cache).unwrap();
create_bare_venv(&location, &info)?;
Ok(())
}

View file

@ -150,6 +150,43 @@ pub enum CacheBucket {
/// Git repositories.
Git,
/// Information about an interpreter at a path.
///
/// To avoid caching pyenv shims, bash scripts which may redirect to a new python version
/// without the shim itself changing, we only cache when the path equals `sys.executable`, i.e.
/// the path we're running is the python executable itself and not a shim.
///
/// Cache structure: `interpreter-v0/<digest(path)>.json`
///
/// # Example
///
/// The contents of each of the json files has a timestamp field in unix time, the [PEP 508]
/// markers and some information from the `sys`/`sysconfig` modules.
///
/// ```json
/// {
/// "timestamp": 1698047994491,
/// "data": {
/// "markers": {
/// "implementation_name": "cpython",
/// "implementation_version": "3.12.0",
/// "os_name": "posix",
/// "platform_machine": "x86_64",
/// "platform_python_implementation": "CPython",
/// "platform_release": "6.5.0-13-generic",
/// "platform_system": "Linux",
/// "platform_version": "#13-Ubuntu SMP PREEMPT_DYNAMIC Fri Nov 3 12:16:05 UTC 2023",
/// "python_full_version": "3.12.0",
/// "python_version": "3.12",
/// "sys_platform": "linux"
/// },
/// "base_exec_prefix": "/home/ferris/.pyenv/versions/3.12.0",
/// "base_prefix": "/home/ferris/.pyenv/versions/3.12.0",
/// "sys_executable": "/home/ferris/projects/puffin/.venv/bin/python"
/// }
/// }
/// ```
///
/// [PEP 508]: https://peps.python.org/pep-0508/#environment-markers
Interpreter,
/// Index responses through the simple metadata API.
Simple,

View file

@ -39,7 +39,6 @@ requirements-txt = { path = "../requirements-txt" }
anstream = { workspace = true }
anyhow = { workspace = true }
bitflags = { workspace = true }
cacache = { workspace = true }
chrono = { workspace = true }
clap = { workspace = true, features = ["derive"] }
colored = { workspace = true }

View file

@ -8,7 +8,7 @@ use miette::{Diagnostic, IntoDiagnostic};
use thiserror::Error;
use platform_host::Platform;
use puffin_cache::{Cache, CacheBucket};
use puffin_cache::Cache;
use puffin_interpreter::Interpreter;
use crate::commands::ExitStatus;
@ -77,12 +77,8 @@ fn venv_impl(
};
let platform = Platform::current().into_diagnostic()?;
let interpreter_info = Interpreter::query(
&base_python,
platform,
&cache.bucket(CacheBucket::Interpreter),
)
.map_err(VenvError::InterpreterError)?;
let interpreter_info =
Interpreter::query(&base_python, platform, cache).map_err(VenvError::InterpreterError)?;
writeln!(
printer,

View file

@ -32,3 +32,4 @@ url = { workspace = true }
[dev-dependencies]
anyhow = { workspace = true }
tokio = { workspace = true, features = ["fs", "macros"] }

View file

@ -18,7 +18,6 @@ pep508_rs = { path = "../pep508-rs", features = ["serde"] }
platform-host = { path = "../platform-host" }
puffin-cache = { path = "../puffin-cache" }
cacache = { workspace = true }
fs-err = { workspace = true, features = ["tokio"] }
serde_json = { workspace = true }
thiserror = { workspace = true }

View file

@ -2,12 +2,14 @@ use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::UNIX_EPOCH;
use fs_err as fs;
use serde::{Deserialize, Serialize};
use tracing::debug;
use pep440_rs::Version;
use pep508_rs::MarkerEnvironment;
use platform_host::Platform;
use puffin_cache::{digest, Cache, CacheBucket};
use crate::python_platform::PythonPlatform;
use crate::Error;
@ -24,7 +26,7 @@ pub struct Interpreter {
impl Interpreter {
/// Detect the interpreter info for the given Python executable.
pub fn query(executable: &Path, platform: Platform, cache: &Path) -> Result<Self, Error> {
pub fn query(executable: &Path, platform: Platform, cache: &Cache) -> Result<Self, Error> {
let info = InterpreterQueryResult::query_cached(executable, cache)?;
debug_assert!(
info.base_prefix == info.base_exec_prefix,
@ -101,7 +103,7 @@ impl Interpreter {
}
}
#[derive(Deserialize, Serialize)]
#[derive(Deserialize, Serialize, Clone)]
pub(crate) struct InterpreterQueryResult {
pub(crate) markers: MarkerEnvironment,
pub(crate) base_exec_prefix: PathBuf,
@ -109,6 +111,12 @@ pub(crate) struct InterpreterQueryResult {
pub(crate) sys_executable: PathBuf,
}
#[derive(Deserialize, Serialize)]
pub(crate) struct CachedByTimestamp<T> {
pub(crate) timestamp: u128,
pub(crate) data: T,
}
impl InterpreterQueryResult {
/// Return the resolved [`InterpreterQueryResult`] for the given Python executable.
pub(crate) fn query(interpreter: &Path) -> Result<Self, Error> {
@ -153,48 +161,43 @@ impl InterpreterQueryResult {
/// Running a Python script is (relatively) expensive, and the markers won't change
/// unless the Python executable changes, so we use the executable's last modified
/// time as a cache key.
pub(crate) fn query_cached(executable: &Path, cache: &Path) -> Result<Self, Error> {
pub(crate) fn query_cached(executable: &Path, cache: &Cache) -> Result<Self, Error> {
let executable_bytes = executable.as_os_str().as_encoded_bytes();
let cache_dir = cache.bucket(CacheBucket::Interpreter);
let cache_path = cache_dir.join(format!("{}.json", digest(&executable_bytes)));
// Read from the cache.
let key = if let Ok(key) = cache_key(executable) {
if let Ok(data) = cacache::read_sync(cache, &key) {
if let Ok(info) = serde_json::from_slice::<Self>(&data) {
debug!("Using cached markers for {}", executable.display());
return Ok(info);
}
if let Ok(data) = fs::read(&cache_path) {
if let Ok(cached) = serde_json::from_slice::<CachedByTimestamp<Self>>(&data) {
debug!("Using cached markers for {}", executable.display());
return Ok(cached.data);
}
Some(key)
} else {
None
};
}
// Otherwise, run the Python script.
debug!("Detecting markers for {}", executable.display());
let info = Self::query(executable)?;
// If the executable is actually a pyenv shim, a bash script that redirects to the activated
// python, we're not allowed to cache the interpreter info
let modified = fs_err::metadata(executable)?
// Note: This is infallible on windows and unix (i.e. all platforms we support)
.modified()?
.duration_since(UNIX_EPOCH)
.map_err(|err| Error::SystemTime(executable.to_path_buf(), err))?;
// If `executable` is a pyenv shim, a bash script that redirects to the activated
// python executable at another path, we're not allowed to cache the interpreter info
if executable == info.sys_executable {
fs::create_dir_all(cache_dir)?;
// Write to the cache.
if let Some(key) = key {
cacache::write_sync(cache, key, serde_json::to_vec(&info)?)?;
}
fs::write(
cache_path,
serde_json::to_vec(&CachedByTimestamp {
timestamp: modified.as_millis(),
data: info.clone(),
})?,
)?;
}
Ok(info)
}
}
/// Create a cache key for the Python executable, consisting of the executable's
/// last modified time and the executable's path.
fn cache_key(executable: &Path) -> Result<String, Error> {
let modified = fs_err::metadata(executable)?
// Note: This is infallible on windows and unix (i.e. all platforms we support)
.modified()?
.duration_since(UNIX_EPOCH)
.map_err(|err| Error::SystemTime(executable.to_path_buf(), err))?;
Ok(format!(
"puffin:v0:{}:{}",
executable.display(),
modified.as_millis()
))
}

View file

@ -38,7 +38,5 @@ pub enum Error {
stderr: String,
},
#[error("Failed to write to cache")]
Cacache(#[from] cacache::Error),
#[error("Failed to write to cache")]
Serde(#[from] serde_json::Error),
}

View file

@ -4,7 +4,7 @@ use std::path::{Path, PathBuf};
use tracing::debug;
use platform_host::Platform;
use puffin_cache::{Cache, CacheBucket};
use puffin_cache::Cache;
use crate::python_platform::PythonPlatform;
use crate::{Error, Interpreter};
@ -24,11 +24,7 @@ impl Virtualenv {
return Err(Error::NotFound);
};
let executable = platform.venv_python(&venv);
let interpreter = Interpreter::query(
&executable,
platform.0,
&cache.bucket(CacheBucket::Interpreter),
)?;
let interpreter = Interpreter::query(&executable, platform.0, cache)?;
Ok(Self {
root: venv,
@ -39,11 +35,7 @@ impl Virtualenv {
pub fn from_virtualenv(platform: Platform, root: &Path, cache: &Cache) -> Result<Self, Error> {
let platform = PythonPlatform::from(platform);
let executable = platform.venv_python(root);
let interpreter = Interpreter::query(
&executable,
platform.0,
&cache.bucket(CacheBucket::Interpreter),
)?;
let interpreter = Interpreter::query(&executable, platform.0, cache)?;
Ok(Self {
root: root.to_path_buf(),

View file

@ -49,7 +49,7 @@ serde_json = { workspace = true }
sha2 = { workspace = true }
tempfile = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tokio = { workspace = true, features = ["macros"] }
tokio-util = { workspace = true, features = ["compat"] }
tracing = { workspace = true }
url = { workspace = true }