Migrate interpreter query to custom caching (#508)

This removes the last usage of cacache by replacing it with a custom,
flat JSON cache keyed by the digest of the executable path.


![image](8f777c4c-1f1b-4656-ba7b-002175270556)

A step towards #478. I've made `CachedByTimestamp<T>` generic over `T`
but have intentionally not moved it to `puffin-cache` yet.
This commit is contained in:
konsti 2023-11-28 18:14:59 +01:00 committed by GitHub
parent 5435d44756
commit d89fbeb642
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 88 additions and 156 deletions

View file

@ -18,6 +18,7 @@ workspace = true
[dependencies]
platform-host = { path = "../platform-host" }
puffin-cache = { path = "../puffin-cache" }
puffin-interpreter = { path = "../puffin-interpreter" }
camino = { workspace = true }

View file

@ -11,6 +11,7 @@ use tracing_subscriber::{fmt, EnvFilter};
use gourgeist::{create_bare_venv, parse_python_cli};
use platform_host::Platform;
use puffin_cache::Cache;
use puffin_interpreter::Interpreter;
#[derive(Parser, Debug)]
@ -25,8 +26,8 @@ fn run() -> Result<(), gourgeist::Error> {
let location = cli.path.unwrap_or(Utf8PathBuf::from(".venv"));
let python = parse_python_cli(cli.python)?;
let platform = Platform::current()?;
let cache = tempfile::tempdir()?;
let info = Interpreter::query(python.as_std_path(), platform, cache.path()).unwrap();
let cache = Cache::temp()?;
let info = Interpreter::query(python.as_std_path(), platform, &cache).unwrap();
create_bare_venv(&location, &info)?;
Ok(())
}

View file

@ -150,6 +150,43 @@ pub enum CacheBucket {
/// Git repositories.
Git,
/// Information about an interpreter at a path.
///
/// To avoid caching pyenv shims (bash scripts which may redirect to a new Python version
/// without the shim itself changing), we only cache when the path equals `sys.executable`, i.e.
/// the path we're running is the Python executable itself and not a shim.
///
/// Cache structure: `interpreter-v0/<digest(path)>.json`
///
/// # Example
///
/// Each JSON file contains a timestamp field in unix time (milliseconds), the [PEP 508]
/// markers and some information from the `sys`/`sysconfig` modules.
///
/// ```json
/// {
/// "timestamp": 1698047994491,
/// "data": {
/// "markers": {
/// "implementation_name": "cpython",
/// "implementation_version": "3.12.0",
/// "os_name": "posix",
/// "platform_machine": "x86_64",
/// "platform_python_implementation": "CPython",
/// "platform_release": "6.5.0-13-generic",
/// "platform_system": "Linux",
/// "platform_version": "#13-Ubuntu SMP PREEMPT_DYNAMIC Fri Nov 3 12:16:05 UTC 2023",
/// "python_full_version": "3.12.0",
/// "python_version": "3.12",
/// "sys_platform": "linux"
/// },
/// "base_exec_prefix": "/home/ferris/.pyenv/versions/3.12.0",
/// "base_prefix": "/home/ferris/.pyenv/versions/3.12.0",
/// "sys_executable": "/home/ferris/projects/puffin/.venv/bin/python"
/// }
/// }
/// ```
///
/// [PEP 508]: https://peps.python.org/pep-0508/#environment-markers
Interpreter,
/// Index responses through the simple metadata API.
Simple,

View file

@ -39,7 +39,6 @@ requirements-txt = { path = "../requirements-txt" }
anstream = { workspace = true }
anyhow = { workspace = true }
bitflags = { workspace = true }
cacache = { workspace = true }
chrono = { workspace = true }
clap = { workspace = true, features = ["derive"] }
colored = { workspace = true }

View file

@ -8,7 +8,7 @@ use miette::{Diagnostic, IntoDiagnostic};
use thiserror::Error;
use platform_host::Platform;
use puffin_cache::{Cache, CacheBucket};
use puffin_cache::Cache;
use puffin_interpreter::Interpreter;
use crate::commands::ExitStatus;
@ -77,12 +77,8 @@ fn venv_impl(
};
let platform = Platform::current().into_diagnostic()?;
let interpreter_info = Interpreter::query(
&base_python,
platform,
&cache.bucket(CacheBucket::Interpreter),
)
.map_err(VenvError::InterpreterError)?;
let interpreter_info =
Interpreter::query(&base_python, platform, cache).map_err(VenvError::InterpreterError)?;
writeln!(
printer,

View file

@ -32,3 +32,4 @@ url = { workspace = true }
[dev-dependencies]
anyhow = { workspace = true }
tokio = { workspace = true, features = ["fs", "macros"] }

View file

@ -18,7 +18,6 @@ pep508_rs = { path = "../pep508-rs", features = ["serde"] }
platform-host = { path = "../platform-host" }
puffin-cache = { path = "../puffin-cache" }
cacache = { workspace = true }
fs-err = { workspace = true, features = ["tokio"] }
serde_json = { workspace = true }
thiserror = { workspace = true }

View file

@ -2,12 +2,14 @@ use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::UNIX_EPOCH;
use fs_err as fs;
use serde::{Deserialize, Serialize};
use tracing::debug;
use pep440_rs::Version;
use pep508_rs::MarkerEnvironment;
use platform_host::Platform;
use puffin_cache::{digest, Cache, CacheBucket};
use crate::python_platform::PythonPlatform;
use crate::Error;
@ -24,7 +26,7 @@ pub struct Interpreter {
impl Interpreter {
/// Detect the interpreter info for the given Python executable.
pub fn query(executable: &Path, platform: Platform, cache: &Path) -> Result<Self, Error> {
pub fn query(executable: &Path, platform: Platform, cache: &Cache) -> Result<Self, Error> {
let info = InterpreterQueryResult::query_cached(executable, cache)?;
debug_assert!(
info.base_prefix == info.base_exec_prefix,
@ -101,7 +103,7 @@ impl Interpreter {
}
}
#[derive(Deserialize, Serialize)]
#[derive(Deserialize, Serialize, Clone)]
pub(crate) struct InterpreterQueryResult {
pub(crate) markers: MarkerEnvironment,
pub(crate) base_exec_prefix: PathBuf,
@ -109,6 +111,12 @@ pub(crate) struct InterpreterQueryResult {
pub(crate) sys_executable: PathBuf,
}
/// A cached value of type `T` stored together with a timestamp.
///
/// Serialized as a JSON object with a `timestamp` field and a `data` field.
///
/// NOTE(review): in the visible part of `query_cached`, `timestamp` is written but never
/// compared against the executable's current modification time when the entry is read back.
/// Confirm whether stale entries are invalidated elsewhere.
#[derive(Deserialize, Serialize)]
pub(crate) struct CachedByTimestamp<T> {
/// Milliseconds since the unix epoch. `query_cached` stores the executable's last
/// modified time here.
pub(crate) timestamp: u128,
/// The cached payload.
pub(crate) data: T,
}
impl InterpreterQueryResult {
/// Return the resolved [`InterpreterQueryResult`] for the given Python executable.
pub(crate) fn query(interpreter: &Path) -> Result<Self, Error> {
@ -153,48 +161,43 @@ impl InterpreterQueryResult {
/// Running a Python script is (relatively) expensive, and the markers won't change
/// unless the Python executable changes, so we use the executable's last modified
/// time as a cache key.
pub(crate) fn query_cached(executable: &Path, cache: &Path) -> Result<Self, Error> {
pub(crate) fn query_cached(executable: &Path, cache: &Cache) -> Result<Self, Error> {
let executable_bytes = executable.as_os_str().as_encoded_bytes();
let cache_dir = cache.bucket(CacheBucket::Interpreter);
let cache_path = cache_dir.join(format!("{}.json", digest(&executable_bytes)));
// Read from the cache.
let key = if let Ok(key) = cache_key(executable) {
if let Ok(data) = cacache::read_sync(cache, &key) {
if let Ok(info) = serde_json::from_slice::<Self>(&data) {
debug!("Using cached markers for {}", executable.display());
return Ok(info);
}
if let Ok(data) = fs::read(&cache_path) {
if let Ok(cached) = serde_json::from_slice::<CachedByTimestamp<Self>>(&data) {
debug!("Using cached markers for {}", executable.display());
return Ok(cached.data);
}
Some(key)
} else {
None
};
}
// Otherwise, run the Python script.
debug!("Detecting markers for {}", executable.display());
let info = Self::query(executable)?;
// If the executable is actually a pyenv shim, a bash script that redirects to the activated
// python, we're not allowed to cache the interpreter info
let modified = fs_err::metadata(executable)?
// Note: This is infallible on windows and unix (i.e. all platforms we support)
.modified()?
.duration_since(UNIX_EPOCH)
.map_err(|err| Error::SystemTime(executable.to_path_buf(), err))?;
// If `executable` is a pyenv shim, a bash script that redirects to the activated
// python executable at another path, we're not allowed to cache the interpreter info
if executable == info.sys_executable {
fs::create_dir_all(cache_dir)?;
// Write to the cache.
if let Some(key) = key {
cacache::write_sync(cache, key, serde_json::to_vec(&info)?)?;
}
fs::write(
cache_path,
serde_json::to_vec(&CachedByTimestamp {
timestamp: modified.as_millis(),
data: info.clone(),
})?,
)?;
}
Ok(info)
}
}
/// Create a cache key for the Python executable, consisting of the executable's
/// last modified time and the executable's path.
fn cache_key(executable: &Path) -> Result<String, Error> {
let modified = fs_err::metadata(executable)?
// Note: This is infallible on windows and unix (i.e. all platforms we support)
.modified()?
.duration_since(UNIX_EPOCH)
.map_err(|err| Error::SystemTime(executable.to_path_buf(), err))?;
Ok(format!(
"puffin:v0:{}:{}",
executable.display(),
modified.as_millis()
))
}

View file

@ -38,7 +38,5 @@ pub enum Error {
stderr: String,
},
#[error("Failed to write to cache")]
Cacache(#[from] cacache::Error),
#[error("Failed to write to cache")]
Serde(#[from] serde_json::Error),
}

View file

@ -4,7 +4,7 @@ use std::path::{Path, PathBuf};
use tracing::debug;
use platform_host::Platform;
use puffin_cache::{Cache, CacheBucket};
use puffin_cache::Cache;
use crate::python_platform::PythonPlatform;
use crate::{Error, Interpreter};
@ -24,11 +24,7 @@ impl Virtualenv {
return Err(Error::NotFound);
};
let executable = platform.venv_python(&venv);
let interpreter = Interpreter::query(
&executable,
platform.0,
&cache.bucket(CacheBucket::Interpreter),
)?;
let interpreter = Interpreter::query(&executable, platform.0, cache)?;
Ok(Self {
root: venv,
@ -39,11 +35,7 @@ impl Virtualenv {
pub fn from_virtualenv(platform: Platform, root: &Path, cache: &Cache) -> Result<Self, Error> {
let platform = PythonPlatform::from(platform);
let executable = platform.venv_python(root);
let interpreter = Interpreter::query(
&executable,
platform.0,
&cache.bucket(CacheBucket::Interpreter),
)?;
let interpreter = Interpreter::query(&executable, platform.0, cache)?;
Ok(Self {
root: root.to_path_buf(),

View file

@ -49,7 +49,7 @@ serde_json = { workspace = true }
sha2 = { workspace = true }
tempfile = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tokio = { workspace = true, features = ["macros"] }
tokio-util = { workspace = true, features = ["compat"] }
tracing = { workspace = true }
url = { workspace = true }