From d4b4c21133917870afc8418db039ae9075d002d5 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Mon, 5 Feb 2024 16:47:53 -0500
Subject: [PATCH] initial implementation of zero-copy deserialization for SimpleMetadata (#1249)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

(Please review this PR commit by commit.)

This PR closes an initial loop on zero-copy deserialization. That is, it provides a way to get an `Archived<SimpleMetadata>` (spelled `OwnedArchive<SimpleMetadata>` in the code) from a `CachedClient`. The main benefit of zero-copy deserialization is that we can read bytes from a file, cast those bytes to a structured representation without cost, and then start using that type as any other Rust type. The "catch" is that the structured representation is not the actual type you started with, but the "archived" version of it.

In order to make all this work, we ended up needing to shave a rather large yak: we had to re-implement HTTP cache semantics. Previously, we were using the `http-cache-semantics` crate. While it does support Serde, it doesn't support `rkyv`. Moreover, even simple support for `rkyv` wouldn't be enough. What we actually want is for the HTTP cache semantics to be implemented on the *archived* type so that we can decide whether our cached response is stale or not without needing to do a full deserialization into the unarchived type. This is why, in this PR, you'll see `impl ArchivedCachePolicy { ... }` instead of `impl CachePolicy { ... }`. (The `derive(rkyv::Archive)` macro automatically introduces the `ArchivedCachePolicy` type into the current namespace.)

Unfortunately, this PR does not fully realize the dream that is zero-copy deserialization. Namely, while a `CachedClient` can now provide an `OwnedArchive<SimpleMetadata>`, the rest of our code doesn't really make use of it. Indeed, as soon as we go to build a `VersionMap`, we eagerly convert our archived metadata into an owned `SimpleMetadata` via deserialization (which *isn't* zero-copy). After this change, a lot of the work shifts to `rkyv` deserialization and `VersionMap` construction. More precisely, the main thing we drop here is `CachePolicy` deserialization (which is now truly zero-copy) and the parsing of the MessagePack format for `SimpleMetadata`. But we are still paying for deserialization. We're just paying for it in a different place.
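To make the core idea concrete, here is a minimal, self-contained sketch of the rkyv pattern this PR builds on. It assumes rkyv 0.7 with the `validation` feature; the `VersionFile` struct and its values are made up for illustration and are not the PR's actual types:

```rust
use rkyv::{Archive, Deserialize, Serialize};

#[derive(Archive, Deserialize, Serialize, Debug, PartialEq)]
#[archive(check_bytes)]
struct VersionFile {
    name: String,
    size: u64,
}

fn main() {
    let value = VersionFile {
        name: "puffin-0.1.0.tar.gz".to_string(),
        size: 1024,
    };

    // Serialize into an `AlignedVec`; in the PR, these bytes live in the on-disk cache.
    let bytes = rkyv::to_bytes::<_, 256>(&value).expect("serialization should not fail");

    // Zero-copy "deserialization": validate the buffer and view it as the
    // *archived* type. No allocation or copying of `name` happens here.
    let archived = rkyv::check_archived_root::<VersionFile>(&bytes).expect("valid archive");
    assert_eq!(archived.name.as_str(), "puffin-0.1.0.tar.gz");
    assert_eq!(archived.size, 1024);

    // Only when an owned value is truly needed do we pay for real
    // deserialization -- this is the cost that moves into `VersionMap`
    // construction in this PR.
    let owned: VersionFile = archived
        .deserialize(&mut rkyv::Infallible)
        .expect("deserialization is infallible here");
    assert_eq!(owned, value);
}
```

This is the same shape of trade-off behind `impl ArchivedCachePolicy { ... }`: freshness checks can run directly against the archived bytes, and the expensive `deserialize` step only happens when an owned value is actually required.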
This PR does seem to bring a speed-up, but it is somewhat underwhelming. My measurements have been pretty noisy, but I get a 1.1x speedup fairly often:

```
$ hyperfine -w5 "puffin-main pip compile --cache-dir ~/astral/tmp/cache-main ~/astral/tmp/reqs/home-assistant-reduced.in -o /dev/null" "puffin-test pip compile --cache-dir ~/astral/tmp/cache-test ~/astral/tmp/reqs/home-assistant-reduced.in -o /dev/null"
Benchmark 1: puffin-main pip compile --cache-dir ~/astral/tmp/cache-main ~/astral/tmp/reqs/home-assistant-reduced.in -o /dev/null
  Time (mean ± σ):     164.4 ms ±  18.8 ms    [User: 427.1 ms, System: 348.6 ms]
  Range (min … max):   131.1 ms … 190.5 ms    18 runs

Benchmark 2: puffin-test pip compile --cache-dir ~/astral/tmp/cache-test ~/astral/tmp/reqs/home-assistant-reduced.in -o /dev/null
  Time (mean ± σ):     148.3 ms ±  10.2 ms    [User: 357.1 ms, System: 319.4 ms]
  Range (min … max):   136.8 ms … 184.4 ms    19 runs

Summary
  puffin-test pip compile --cache-dir ~/astral/tmp/cache-test ~/astral/tmp/reqs/home-assistant-reduced.in -o /dev/null ran
    1.11 ± 0.15 times faster than puffin-main pip compile --cache-dir ~/astral/tmp/cache-main ~/astral/tmp/reqs/home-assistant-reduced.in -o /dev/null
```

One downside is that this does increase cache size (`rkyv`'s serialization format is not as compact as MessagePack). On-disk size increases by about 1.8x for our `simple-v0` cache.

```
$ sort-filesize cache-main
4.0K    cache-main/CACHEDIR.TAG
4.0K    cache-main/.gitignore
8.0K    cache-main/interpreter-v0
8.7M    cache-main/wheels-v0
18M     cache-main/archive-v0
59M     cache-main/simple-v0
109M    cache-main/built-wheels-v0
193M    cache-main
193M    total

$ sort-filesize cache-test
4.0K    cache-test/CACHEDIR.TAG
4.0K    cache-test/.gitignore
8.0K    cache-test/interpreter-v0
8.7M    cache-test/wheels-v0
18M     cache-test/archive-v0
107M    cache-test/simple-v0
109M    cache-test/built-wheels-v0
242M    cache-test
242M    total
```

Also, while I initially intended to do a simplistic implementation of HTTP cache semantics, I found that everything was somewhat interconnected. I could have written code that _specifically_ only worked with the present behavior of PyPI, but then it would need to be special-cased and everything else would need to continue to use `http-cache-semantics`. By implementing what we need based on what Puffin actually is (which is still less than what `http-cache-semantics` does), we can avoid special-casing and use zero-copy deserialization for our cache policy in _all_ cases.
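One more piece of orientation before the diff: the new cache-entry format documented in `cached_client.rs` below writes the payload bytes as-is, then the archived `CachePolicy`, then the policy's length as a trailing 64-bit little-endian integer. Here is a rough sketch of reading that footer-based layout; the function name is made up for this example, and it works on plain byte slices where the real code reads into an rkyv `AlignedVec` and does more validation:

```rust
/// Layout: [payload bytes][archived CachePolicy bytes][u64 LE length of the policy bytes]
fn split_payload_and_policy(bytes: &[u8]) -> Option<(&[u8], &[u8])> {
    // The last 8 bytes encode the length of the archived cache policy.
    let len_start = bytes.len().checked_sub(8)?;
    let policy_len = u64::from_le_bytes(bytes[len_start..].try_into().ok()?);
    let policy_len = usize::try_from(policy_len).ok()?;
    // The archived policy sits immediately before the length footer.
    let policy_start = len_start.checked_sub(policy_len)?;
    Some((&bytes[..policy_start], &bytes[policy_start..len_start]))
}

fn main() {
    // A fake entry: 5 payload bytes, 3 "policy" bytes, then the 8-byte footer.
    let mut entry = b"hello".to_vec();
    entry.extend_from_slice(b"xyz");
    entry.extend_from_slice(&3u64.to_le_bytes());

    let (payload, policy) = split_payload_and_policy(&entry).unwrap();
    assert_eq!(payload, b"hello");
    assert_eq!(policy, b"xyz");
}
```

Putting the length at the end is what leaves the door open (as discussed in the doc comments below) to rewriting only the cache policy after a revalidation without touching the payload bytes.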
--- Cargo.lock | 24 - Cargo.toml | 1 - crates/puffin-cache/src/lib.rs | 12 +- crates/puffin-client/Cargo.toml | 1 - crates/puffin-client/src/cache_headers.rs | 56 - crates/puffin-client/src/cached_client.rs | 898 +++++++---- crates/puffin-client/src/flat_index.rs | 2 +- crates/puffin-client/src/httpcache/control.rs | 775 ++++++++++ crates/puffin-client/src/httpcache/mod.rs | 1358 +++++++++++++++++ crates/puffin-client/src/lib.rs | 2 +- crates/puffin-client/src/registry_client.rs | 22 +- crates/puffin-dev/src/resolve_many.rs | 5 +- .../src/distribution_database.rs | 4 +- crates/puffin-distribution/src/source/mod.rs | 13 +- crates/puffin-resolver/Cargo.toml | 1 - crates/puffin-resolver/src/finder.rs | 5 +- crates/puffin-resolver/src/version_map.rs | 13 +- 17 files changed, 2763 insertions(+), 429 deletions(-) delete mode 100644 crates/puffin-client/src/cache_headers.rs create mode 100644 crates/puffin-client/src/httpcache/control.rs create mode 100644 crates/puffin-client/src/httpcache/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 00452ae51..d1a32d60d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1361,34 +1361,12 @@ dependencies = [ "pin-project-lite", ] -[[package]] -name = "http-cache-semantics" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7aec9f678bca3f4a15194b980f20ed9bfe0dd38e8d298c65c559a93dfbd6380a" -dependencies = [ - "http", - "http-serde", - "serde", - "time", -] - [[package]] name = "http-content-range" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f0d1a8ef218a86416107794b34cc446958d9203556c312bb41eab4c924c1d2e" -[[package]] -name = "http-serde" -version = "1.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f560b665ad9f1572cfcaf034f7fb84338a7ce945216d64a90fd81f046a3caee" -dependencies = [ - "http", - "serde", -] - [[package]] name = "httparse" version = "1.8.0" @@ -2584,7 +2562,6 @@ dependencies = [ "futures", "html-escape", "http", - "http-cache-semantics", "insta", "install-wheel-rs", "pep440_rs", @@ -2868,7 +2845,6 @@ dependencies = [ "fs-err", "futures", "gourgeist", - "http-cache-semantics", "indexmap 2.2.2", "insta", "install-wheel-rs", diff --git a/Cargo.toml b/Cargo.toml index 33f4b8b02..7c5538f25 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -50,7 +50,6 @@ hmac = { version = "0.12.1" } home = { version = "0.5.9" } html-escape = { version = "0.2.13" } http = { version = "0.2.11" } -http-cache-semantics = { version = "1.0.2" } indexmap = { version = "2.1.0" } indicatif = { version = "0.17.7" } indoc = { version = "2.0.4" } diff --git a/crates/puffin-cache/src/lib.rs b/crates/puffin-cache/src/lib.rs index d453c3858..65f9837a3 100644 --- a/crates/puffin-cache/src/lib.rs +++ b/crates/puffin-cache/src/lib.rs @@ -402,7 +402,7 @@ pub enum CacheBucket { /// The following requirements: /// ```text /// # git source dist - /// pydantic-extra-types @ git+https://github.com/pydantic/pydantic-extra-types.git + /// pydantic-extra-types @ git+https://github.com/pydantic/pydantic-extra-types.git /// # pypi source dist /// django_allauth==0.51.0 /// # url source dist @@ -488,8 +488,8 @@ pub enum CacheBucket { /// Index responses through the simple metadata API. /// /// Cache structure: - /// * `simple-v0/pypi/.msgpack` - /// * `simple-v0//.msgpack` + /// * `simple-v0/pypi/.rkyv` + /// * `simple-v0//.rkyv` /// /// The response is parsed into `puffin_client::SimpleMetadata` before storage. 
Simple, @@ -575,15 +575,15 @@ impl CacheBucket { } } CacheBucket::Simple => { - // For `pypi` wheels, we expect a MsgPack file per package, indexed by name. + // For `pypi` wheels, we expect a rkyv file per package, indexed by name. let root = cache.bucket(self).join(WheelCacheKind::Pypi); - summary += rm_rf(root.join(format!("{name}.msgpack")))?; + summary += rm_rf(root.join(format!("{name}.rkyv")))?; // For alternate indices, we expect a directory for every index, followed by a // MsgPack file per package, indexed by name. let root = cache.bucket(self).join(WheelCacheKind::Url); for directory in directories(root) { - summary += rm_rf(directory.join(format!("{name}.msgpack")))?; + summary += rm_rf(directory.join(format!("{name}.rkyv")))?; } } CacheBucket::FlatIndex => { diff --git a/crates/puffin-client/Cargo.toml b/crates/puffin-client/Cargo.toml index 1b5b3e686..b26015ceb 100644 --- a/crates/puffin-client/Cargo.toml +++ b/crates/puffin-client/Cargo.toml @@ -23,7 +23,6 @@ fs-err = { workspace = true, features = ["tokio"] } futures = { workspace = true } html-escape = { workspace = true } http = { workspace = true } -http-cache-semantics = { workspace = true } reqwest = { workspace = true } reqwest-middleware = { workspace = true } reqwest-retry = { workspace = true } diff --git a/crates/puffin-client/src/cache_headers.rs b/crates/puffin-client/src/cache_headers.rs deleted file mode 100644 index 3728a6ed6..000000000 --- a/crates/puffin-client/src/cache_headers.rs +++ /dev/null @@ -1,56 +0,0 @@ -use http::HeaderValue; -use rustc_hash::FxHashMap; -use std::collections::hash_map::Entry; - -/// Cache headers from an HTTP response. -#[derive(Debug, Default)] -pub(crate) struct CacheHeaders(FxHashMap, Option>>); - -impl CacheHeaders { - /// Parse the `Cache-Control` header from an HTTP response. - /// - /// See: - pub(crate) fn from_response<'a>( - headers: impl IntoIterator, - ) -> CacheHeaders { - let mut cc = FxHashMap::, Option>>::default(); - let mut is_valid = true; - - for h in headers.into_iter().filter_map(|v| v.to_str().ok()) { - for part in h.split(',') { - // TODO: lame parsing - if part.trim().is_empty() { - continue; - } - let mut kv = part.splitn(2, '='); - let k = kv.next().unwrap().trim(); - if k.is_empty() { - continue; - } - let v = kv.next().map(str::trim); - match cc.entry(k.into()) { - Entry::Occupied(e) => { - // When there is more than one value present for a given directive (e.g., two Expires header fields, multiple Cache-Control: max-age directives), - // the directive's value is considered invalid. Caches are encouraged to consider responses that have invalid freshness information to be stale - if e.get().as_deref() != v { - is_valid = false; - } - } - Entry::Vacant(e) => { - e.insert(v.map(|v| v.trim_matches('"')).map(From::from)); - // TODO: bad unquoting - } - } - } - } - if !is_valid { - cc.insert("must-revalidate".into(), None); - } - Self(cc) - } - - /// Returns `true` if the response has an `immutable` directive. 
- pub(crate) fn is_immutable(&self) -> bool { - self.0.contains_key("immutable") - } -} diff --git a/crates/puffin-client/src/cached_client.rs b/crates/puffin-client/src/cached_client.rs index 4c82ae31b..b4420ac82 100644 --- a/crates/puffin-client/src/cached_client.rs +++ b/crates/puffin-client/src/cached_client.rs @@ -1,20 +1,98 @@ -use std::future::Future; -use std::time::SystemTime; +use std::{borrow::Cow, future::Future, path::Path}; use futures::FutureExt; -use http::request::Parts; -use http_cache_semantics::{AfterResponse, BeforeRequest, CachePolicy}; use reqwest::{Request, Response}; use reqwest_middleware::ClientWithMiddleware; +use rkyv::util::AlignedVec; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use tracing::{debug, info_span, instrument, trace, warn, Instrument}; -use url::Url; use puffin_cache::{CacheEntry, Freshness}; use puffin_fs::write_atomic; -use crate::{cache_headers::CacheHeaders, Error, ErrorKind}; +use crate::{ + httpcache::{AfterResponse, BeforeRequest, CachePolicy, CachePolicyBuilder}, + rkyvutil::OwnedArchive, + Error, ErrorKind, +}; + +/// A trait the generalizes (de)serialization at a high level. +/// +/// The main purpose of this trait is to make the `CachedClient` work for +/// either serde or other mechanisms of serialization such as `rkyv`. +/// +/// If you're using Serde, then unless you want to control the format, callers +/// should just use `CachedClient::get_serde`. This will use a default +/// implementation of `Cacheable` internally. +/// +/// Alternatively, callers using `rkyv` should use +/// `CachedClient::get_cacheable`. If your types fit into the +/// `rkyvutil::OwnedArchive` mold, then an implementation of `Cacheable` is +/// already provided for that type. +pub trait Cacheable: Sized + Send { + /// This associated type permits customizing what the "output" type of + /// deserialization is. It can be identical to `Self`. + /// + /// Typical use of this is for wrapper types used to proviate blanket trait + /// impls without hitting overlapping impl problems. + type Target; + + /// Deserialize a value from bytes aligned to a 16-byte boundary. + fn from_aligned_bytes(bytes: AlignedVec) -> Result; + /// Serialize bytes to a possibly owned byte buffer. + fn to_bytes(&self) -> Result, crate::Error>; + /// Convert this type into its final form. + fn into_target(self) -> Self::Target; +} + +/// A wrapper type that makes anything with Serde support automatically +/// implement `Cacheable`. +#[derive(Debug, Deserialize, Serialize)] +#[serde(transparent)] +pub struct SerdeCacheable { + inner: T, +} + +impl Cacheable for SerdeCacheable { + type Target = T; + + fn from_aligned_bytes(bytes: AlignedVec) -> Result { + Ok(rmp_serde::from_slice::(&bytes).map_err(ErrorKind::Decode)?) + } + + fn to_bytes(&self) -> Result, Error> { + Ok(Cow::from( + rmp_serde::to_vec(&self.inner).map_err(ErrorKind::Encode)?, + )) + } + + fn into_target(self) -> Self::Target { + self.inner + } +} + +/// All `OwnedArchive` values are cacheable. 
+impl Cacheable for OwnedArchive +where + A: rkyv::Archive + rkyv::Serialize> + Send, + A::Archived: for<'a> rkyv::CheckBytes> + + rkyv::Deserialize, +{ + type Target = OwnedArchive; + + fn from_aligned_bytes(bytes: AlignedVec) -> Result, Error> { + OwnedArchive::new(bytes) + } + + fn to_bytes(&self) -> Result, Error> { + Ok(Cow::from(OwnedArchive::as_bytes(self))) + } + + fn into_target(self) -> Self::Target { + self + } +} /// Either a cached client error or a (user specified) error from the callback #[derive(Debug)] @@ -44,285 +122,6 @@ impl> From> for Error { } } -#[derive(Debug)] -enum CachedResponse { - /// The cached response is fresh without an HTTP request (e.g. immutable) - FreshCache(Payload), - /// The cached response is fresh after an HTTP request (e.g. 304 not modified) - NotModified(DataWithCachePolicy), - /// There was no prior cached response or the cache was outdated - /// - /// The cache policy is `None` if it isn't storable - ModifiedOrNew(Response, Option>), -} - -/// Serialize the actual payload together with its caching information. -#[derive(Debug, Deserialize, Serialize)] -pub struct DataWithCachePolicy { - pub data: Payload, - /// Whether the response should be considered immutable. - immutable: bool, - /// The [`CachePolicy`] is used to determine if the response is fresh or stale. - /// The policy is large (448 bytes at time of writing), so we reduce the stack size by - /// boxing it. - cache_policy: Box, -} - -/// Custom caching layer over [`reqwest::Client`] using `http-cache-semantics`. -/// -/// The implementation takes inspiration from the `http-cache` crate, but adds support for running -/// an async callback on the response before caching. We use this to e.g. store a -/// parsed version of the wheel metadata and for our remote zip reader. In the latter case, we want -/// to read a single file from a remote zip using range requests (so we don't have to download the -/// entire file). We send a HEAD request in the caching layer to check if the remote file has -/// changed (and if range requests are supported), and in the callback we make the actual range -/// requests if required. -/// -/// Unlike `http-cache`, all outputs must be serde-able. Currently everything is json, but we can -/// transparently switch to a faster/smaller format. -/// -/// Again unlike `http-cache`, the caller gets full control over the cache key with the assumption -/// that it's a file. -#[derive(Debug, Clone)] -pub struct CachedClient(ClientWithMiddleware); - -impl CachedClient { - pub fn new(client: ClientWithMiddleware) -> Self { - Self(client) - } - - /// The middleware is the retry strategy - pub fn uncached(&self) -> ClientWithMiddleware { - self.0.clone() - } - - /// Make a cached request with a custom response transformation - /// - /// If a new response was received (no prior cached response or modified on the remote), the - /// response is passed through `response_callback` and only the result is cached and returned. - /// The `response_callback` is allowed to make subsequent requests, e.g. through the uncached - /// client. 
- #[instrument(skip_all)] - pub async fn get_cached_with_callback< - Payload: Serialize + DeserializeOwned + Send + 'static, - CallBackError, - Callback, - CallbackReturn, - >( - &self, - req: Request, - cache_entry: &CacheEntry, - cache_control: CacheControl, - response_callback: Callback, - ) -> Result> - where - Callback: FnOnce(Response) -> CallbackReturn, - CallbackReturn: Future> + Send, - { - let cached = Self::read_cache(cache_entry).await; - - let cached_response = self.send_cached(req, cache_control, cached).boxed().await?; - - let write_cache = info_span!("write_cache", file = %cache_entry.path().display()); - match cached_response { - CachedResponse::FreshCache(data) => Ok(data), - CachedResponse::NotModified(data_with_cache_policy) => { - async { - let data = - rmp_serde::to_vec(&data_with_cache_policy).map_err(ErrorKind::Encode)?; - write_atomic(cache_entry.path(), data) - .await - .map_err(ErrorKind::CacheWrite)?; - Ok(data_with_cache_policy.data) - } - .instrument(write_cache) - .await - } - CachedResponse::ModifiedOrNew(res, cache_policy) => { - let headers = CacheHeaders::from_response(res.headers().get_all("cache-control")); - let immutable = headers.is_immutable(); - - let data = response_callback(res) - .boxed() - .await - .map_err(|err| CachedClientError::Callback(err))?; - if let Some(cache_policy) = cache_policy { - let data_with_cache_policy = DataWithCachePolicy { - data, - immutable, - cache_policy, - }; - async { - fs_err::tokio::create_dir_all(cache_entry.dir()) - .await - .map_err(ErrorKind::CacheWrite)?; - let data = rmp_serde::to_vec(&data_with_cache_policy) - .map_err(ErrorKind::Encode)?; - write_atomic(cache_entry.path(), data) - .await - .map_err(ErrorKind::CacheWrite)?; - Ok(data_with_cache_policy.data) - } - .instrument(write_cache) - .await - } else { - Ok(data) - } - } - } - } - - async fn read_cache( - cache_entry: &CacheEntry, - ) -> Option> { - let read_span = info_span!("read_cache", file = %cache_entry.path().display()); - let read_result = fs_err::tokio::read(cache_entry.path()) - .instrument(read_span) - .await; - - if let Ok(cached) = read_result { - let parse_span = info_span!( - "parse_cache", - path = %cache_entry.path().display() - ); - let parse_result = tokio::task::spawn_blocking(move || { - parse_span - .in_scope(|| rmp_serde::from_slice::>(&cached)) - }) - .await - .expect("Tokio executor failed, was there a panic?"); - match parse_result { - Ok(data) => Some(data), - Err(err) => { - warn!( - "Broken cache entry at {}, removing: {err}", - cache_entry.path().display() - ); - let _ = fs_err::tokio::remove_file(&cache_entry.path()).await; - None - } - } - } else { - None - } - } - - /// `http-cache-semantics` to `reqwest` wrapper - async fn send_cached( - &self, - mut req: Request, - cache_control: CacheControl, - cached: Option>, - ) -> Result, Error> { - let url = req.url().clone(); - let cached_response = if let Some(cached) = cached { - // Avoid sending revalidation requests for immutable responses. - if cached.immutable && !cached.cache_policy.is_stale(SystemTime::now()) { - debug!("Found immutable response for: {url}"); - return Ok(CachedResponse::FreshCache(cached.data)); - } - - // Apply the cache control header, if necessary. 
- match cache_control { - CacheControl::None => {} - CacheControl::MustRevalidate => { - req.headers_mut().insert( - http::header::CACHE_CONTROL, - http::HeaderValue::from_static("max-age=0, must-revalidate"), - ); - } - } - - match cached - .cache_policy - .before_request(&RequestLikeReqwest(&req), SystemTime::now()) - { - BeforeRequest::Fresh(_) => { - debug!("Found fresh response for: {url}"); - CachedResponse::FreshCache(cached.data) - } - BeforeRequest::Stale { request, matches } => { - self.send_cached_handle_stale(req, url, cached, &request, matches) - .await? - } - } - } else { - debug!("No cache entry for: {url}"); - self.fresh_request(req).await? - }; - Ok(cached_response) - } - - async fn send_cached_handle_stale( - &self, - mut req: Request, - url: Url, - cached: DataWithCachePolicy, - request: &Parts, - matches: bool, - ) -> Result, Error> { - if !matches { - // This shouldn't happen; if it does, we'll override the cache. - warn!("Cached request doesn't match current request for: {url}"); - return self.fresh_request(req).await; - } - - debug!("Sending revalidation request for: {url}"); - for header in &request.headers { - req.headers_mut().insert(header.0.clone(), header.1.clone()); - } - let res = self - .0 - .execute(req.try_clone().expect("streaming requests not supported")) - .instrument(info_span!("revalidation_request", url = url.as_str())) - .await - .map_err(ErrorKind::RequestMiddlewareError)? - .error_for_status() - .map_err(ErrorKind::RequestError)?; - let after_response = cached.cache_policy.after_response( - &RequestLikeReqwest(&req), - &ResponseLikeReqwest(&res), - SystemTime::now(), - ); - match after_response { - AfterResponse::NotModified(new_policy, _parts) => { - debug!("Found not-modified response for: {url}"); - let headers = CacheHeaders::from_response(res.headers().get_all("cache-control")); - let immutable = headers.is_immutable(); - Ok(CachedResponse::NotModified(DataWithCachePolicy { - data: cached.data, - immutable, - cache_policy: Box::new(new_policy), - })) - } - AfterResponse::Modified(new_policy, _parts) => { - debug!("Found modified response for: {url}"); - Ok(CachedResponse::ModifiedOrNew( - res, - new_policy.is_storable().then(|| Box::new(new_policy)), - )) - } - } - } - - #[instrument(skip_all, fields(url = req.url().as_str()))] - async fn fresh_request(&self, req: Request) -> Result, Error> { - trace!("{} {}", req.method(), req.url()); - let res = self - .0 - .execute(req.try_clone().expect("streaming requests not supported")) - .await - .map_err(ErrorKind::RequestMiddlewareError)? - .error_for_status() - .map_err(ErrorKind::RequestError)?; - let cache_policy = CachePolicy::new(&RequestLikeReqwest(&req), &ResponseLikeReqwest(&res)); - Ok(CachedResponse::ModifiedOrNew( - res, - cache_policy.is_storable().then(|| Box::new(cache_policy)), - )) - } -} - #[derive(Debug, Clone, Copy)] pub enum CacheControl { /// Respect the `cache-control` header from the response. @@ -341,44 +140,515 @@ impl From for CacheControl { } } -#[derive(Debug)] -struct RequestLikeReqwest<'a>(&'a Request); +/// Custom caching layer over [`reqwest::Client`]. +/// +/// The implementation takes inspiration from the `http-cache` crate, but adds support for running +/// an async callback on the response before caching. We use this to e.g. store a +/// parsed version of the wheel metadata and for our remote zip reader. In the latter case, we want +/// to read a single file from a remote zip using range requests (so we don't have to download the +/// entire file). 
We send a HEAD request in the caching layer to check if the remote file has +/// changed (and if range requests are supported), and in the callback we make the actual range +/// requests if required. +/// +/// Unlike `http-cache`, all outputs must be serializable/deserializable in some way, by +/// implementing the `Cacheable` trait. +/// +/// Again unlike `http-cache`, the caller gets full control over the cache key with the assumption +/// that it's a file. +#[derive(Debug, Clone)] +pub struct CachedClient(ClientWithMiddleware); -impl<'a> http_cache_semantics::RequestLike for RequestLikeReqwest<'a> { - fn uri(&self) -> http::uri::Uri { - // This converts from a url::Url (as returned by reqwest::Request::url) - // to a http::uri::Uri. The conversion requires parsing, but this is - // only called ~once per HTTP request. We can afford it. - self.0 - .url() - .as_str() - .parse() - .expect("reqwest::Request::url always returns a valid URL") +impl CachedClient { + pub fn new(client: ClientWithMiddleware) -> Self { + Self(client) } - fn is_same_uri(&self, other: &http::uri::Uri) -> bool { - // At time of writing, I saw no way to cheaply compare a http::uri::Uri - // with a url::Url. We can at least avoid parsing anything, and - // Url::as_str() is free. In practice though, this routine is called - // ~once per HTTP request. We can afford it. (And it looks like - // http::uri::Uri's PartialEq implementation has been tuned.) - self.0.url().as_str() == *other + + /// The middleware is the retry strategy + pub fn uncached(&self) -> ClientWithMiddleware { + self.0.clone() } - fn method(&self) -> &http::method::Method { - self.0.method() + + /// Make a cached request with a custom response transformation + /// while using serde to (de)serialize cached responses. + /// + /// If a new response was received (no prior cached response or modified + /// on the remote), the response is passed through `response_callback` and + /// only the result is cached and returned. The `response_callback` is + /// allowed to make subsequent requests, e.g. through the uncached client. + #[instrument(skip_all)] + pub async fn get_serde< + Payload: Serialize + DeserializeOwned + Send + 'static, + CallBackError, + Callback, + CallbackReturn, + >( + &self, + req: Request, + cache_entry: &CacheEntry, + cache_control: CacheControl, + response_callback: Callback, + ) -> Result> + where + Callback: FnOnce(Response) -> CallbackReturn + Send, + CallbackReturn: Future> + Send, + { + let payload = self + .get_cacheable(req, cache_entry, cache_control, move |resp| async { + let payload = response_callback(resp).await?; + Ok(SerdeCacheable { inner: payload }) + }) + .await?; + Ok(payload) } - fn headers(&self) -> &http::header::HeaderMap { - self.0.headers() + + /// Make a cached request with a custom response transformation while using + /// the `Cacheable` trait to (de)serialize cached responses. + /// + /// The purpose of this routine is the use of `Cacheable`. Namely, it + /// generalizes over (de)serialization such that mechanisms other than + /// serde (such as rkyv) can be used to manage (de)serialization of cached + /// data. + /// + /// If a new response was received (no prior cached response or modified + /// on the remote), the response is passed through `response_callback` and + /// only the result is cached and returned. The `response_callback` is + /// allowed to make subsequent requests, e.g. through the uncached client. 
+ #[instrument(skip_all)] + pub async fn get_cacheable( + &self, + req: Request, + cache_entry: &CacheEntry, + cache_control: CacheControl, + response_callback: Callback, + ) -> Result> + where + Callback: FnOnce(Response) -> CallbackReturn, + CallbackReturn: Future> + Send, + { + let cached_response = match Self::read_cache(cache_entry).await { + Some(cached) => self.send_cached(req, cache_control, cached).boxed().await?, + None => { + debug!("No cache entry for: {}", req.url()); + self.fresh_request(req).await? + } + }; + match cached_response { + CachedResponse::FreshCache(cached) => Ok(Payload::from_aligned_bytes(cached.data)?), + CachedResponse::NotModified { cached, new_policy } => { + let refresh_cache = + info_span!("refresh_cache", file = %cache_entry.path().display()); + async { + let data_with_cache_policy_bytes = + DataWithCachePolicy::serialize(&new_policy, &cached.data)?; + write_atomic(cache_entry.path(), data_with_cache_policy_bytes) + .await + .map_err(ErrorKind::CacheWrite)?; + Ok(Payload::from_aligned_bytes(cached.data)?) + } + .instrument(refresh_cache) + .await + } + CachedResponse::ModifiedOrNew { + response, + cache_policy, + } => { + let new_cache = info_span!("new_cache", file = %cache_entry.path().display()); + let data = response_callback(response) + .boxed() + .await + .map_err(|err| CachedClientError::Callback(err))?; + let Some(cache_policy) = cache_policy else { + return Ok(data.into_target()); + }; + async { + fs_err::tokio::create_dir_all(cache_entry.dir()) + .await + .map_err(ErrorKind::CacheWrite)?; + let data_with_cache_policy_bytes = + DataWithCachePolicy::serialize(&cache_policy, &data.to_bytes()?)?; + write_atomic(cache_entry.path(), data_with_cache_policy_bytes) + .await + .map_err(ErrorKind::CacheWrite)?; + Ok(data.into_target()) + } + .instrument(new_cache) + .await + } + } + } + + async fn read_cache(cache_entry: &CacheEntry) -> Option { + let span = info_span!("read_and_parse_cache", file = %cache_entry.path().display()); + match span + .in_scope(|| DataWithCachePolicy::from_path_async(cache_entry.path())) + .await + { + Ok(data) => Some(data), + Err(err) => { + warn!( + "Broken cache entry at {}, removing: {err}", + cache_entry.path().display() + ); + let _ = fs_err::tokio::remove_file(&cache_entry.path()).await; + None + } + } + } + + /// Send a request given that we have a (possibly) stale cached response. + /// + /// If the cached response is valid but stale, then this will attempt a + /// revalidation request. + async fn send_cached( + &self, + mut req: Request, + cache_control: CacheControl, + cached: DataWithCachePolicy, + ) -> Result { + // Apply the cache control header, if necessary. + match cache_control { + CacheControl::None => {} + CacheControl::MustRevalidate => { + req.headers_mut().insert( + http::header::CACHE_CONTROL, + http::HeaderValue::from_static("no-cache"), + ); + } + } + Ok(match cached.cache_policy.before_request(&mut req) { + BeforeRequest::Fresh => { + debug!("Found fresh response for: {}", req.url()); + CachedResponse::FreshCache(cached) + } + BeforeRequest::Stale(new_cache_policy_builder) => { + debug!("Found stale response for: {}", req.url()); + self.send_cached_handle_stale(req, cached, new_cache_policy_builder) + .await? + } + BeforeRequest::NoMatch => { + // This shouldn't happen; if it does, we'll override the cache. + warn!( + "Cached request doesn't match current request for: {}", + req.url() + ); + self.fresh_request(req).await? 
+ } + }) + } + + async fn send_cached_handle_stale( + &self, + req: Request, + cached: DataWithCachePolicy, + new_cache_policy_builder: CachePolicyBuilder, + ) -> Result { + let url = req.url().clone(); + debug!("Sending revalidation request for: {url}"); + let response = self + .0 + .execute(req) + .instrument(info_span!("revalidation_request", url = url.as_str())) + .await + .map_err(ErrorKind::RequestMiddlewareError)? + .error_for_status() + .map_err(ErrorKind::RequestError)?; + match cached + .cache_policy + .after_response(new_cache_policy_builder, &response) + { + AfterResponse::NotModified(new_policy) => { + debug!("Found not-modified response for: {url}"); + Ok(CachedResponse::NotModified { + cached, + new_policy: Box::new(new_policy), + }) + } + AfterResponse::Modified(new_policy) => { + debug!("Found modified response for: {url}"); + Ok(CachedResponse::ModifiedOrNew { + response, + cache_policy: new_policy + .to_archived() + .is_storable() + .then(|| Box::new(new_policy)), + }) + } + } + } + + #[instrument(skip_all, fields(url = req.url().as_str()))] + async fn fresh_request(&self, req: Request) -> Result { + trace!("Sending fresh {} request for {}", req.method(), req.url()); + let cache_policy_builder = CachePolicyBuilder::new(&req); + let response = self + .0 + .execute(req) + .await + .map_err(ErrorKind::RequestMiddlewareError)? + .error_for_status() + .map_err(ErrorKind::RequestError)?; + let cache_policy = cache_policy_builder.build(&response); + Ok(CachedResponse::ModifiedOrNew { + response, + cache_policy: cache_policy + .to_archived() + .is_storable() + .then(|| Box::new(cache_policy)), + }) } } #[derive(Debug)] -struct ResponseLikeReqwest<'a>(&'a Response); +enum CachedResponse { + /// The cached response is fresh without an HTTP request (e.g. age < max-age). + FreshCache(DataWithCachePolicy), + /// The cached response is fresh after an HTTP request (e.g. 304 not modified) + NotModified { + /// The cached response (with its old cache policy). + cached: DataWithCachePolicy, + /// The new [`CachePolicy`] is used to determine if the response + /// is fresh or stale when making subsequent requests for the same + /// resource. This policy should overwrite the old policy associated + /// with the cached response. In particular, this new policy is derived + /// from data received in a revalidation response, which might change + /// the parameters of cache behavior. + /// + /// The policy is large (352 bytes at time of writing), so we reduce + /// the stack size by boxing it. + new_policy: Box, + }, + /// There was no prior cached response or the cache was outdated + /// + /// The cache policy is `None` if it isn't storable + ModifiedOrNew { + /// The response received from the server. + response: Response, + /// The [`CachePolicy`] is used to determine if the response is fresh or + /// stale when making subsequent requests for the same resource. + /// + /// The policy is large (352 bytes at time of writing), so we reduce + /// the stack size by boxing it. + cache_policy: Option>, + }, +} -impl<'a> http_cache_semantics::ResponseLike for ResponseLikeReqwest<'a> { - fn status(&self) -> http::status::StatusCode { - self.0.status() +/// Represents an arbitrary data blob with an associated HTTP cache policy. +/// +/// The cache policy is used to determine whether the data blob is stale or +/// not. +/// +/// # Format +/// +/// This type encapsulates the format for how blobs of data are stored on +/// disk. The format is very simple. First, the blob of data is written as-is. 
+/// Second, the archived representation of a `CachePolicy` is written. Thirdly, +/// the length, in bytes, of the archived `CachePolicy` is written as a 64-bit +/// little endian integer. +/// +/// Reading the format is done via an `AlignedVec` so that `rkyv` can correctly +/// read the archived representation of the data blob. The cache policy is +/// split into its own `AlignedVec` allocation. +/// +/// # Future ideas +/// +/// This format was also chosen because it should in theory permit rewriting +/// the cache policy without needing to rewrite the data blob if the blob has +/// not changed. For example, this case occurs when a revalidation request +/// responds with HTTP 304 NOT MODIFIED. At time of writing, this is not yet +/// implemented because 1) the synchronization specifics of mutating a cache +/// file have not been worked out and 2) it's not clear if it's a win. +/// +/// An alternative format would be to write the cache policy and the +/// blob in two distinct files. This would avoid needing to worry about +/// synchronization, but it means reading two files instead of one for every +/// cached response in the fast path. It's unclear whether it's worth it. +/// (Experiments have not yet been done.) +/// +/// Another approach here would be to memory map the file and rejigger +/// `OwnedArchive` (or create a new type) that works with a memory map instead +/// of an `AlignedVec`. This will require care to ensure alignment is handled +/// correctly. This approach has not been litigated yet. I did not start with +/// it because experiments with ripgrep have tended to show that (on Linux) +/// memory mapping a bunch of small files ends up being quite a bit slower than +/// just reading them on to the heap. +#[derive(Debug)] +pub struct DataWithCachePolicy { + pub data: AlignedVec, + cache_policy: OwnedArchive, +} + +impl DataWithCachePolicy { + /// Loads cached data and its associated HTTP cache policy from the given + /// file path in an asynchronous fashion (via `spawn_blocking`). + /// + /// # Errors + /// + /// If the given byte buffer is not in a valid format or if reading the + /// file given fails, then this returns an error. + async fn from_path_async(path: &Path) -> Result { + let path = path.to_path_buf(); + tokio::task::spawn_blocking(move || DataWithCachePolicy::from_path_sync(&path)) + .await + // This just forwards panics from the closure. + .unwrap() } - fn headers(&self) -> &http::header::HeaderMap { - self.0.headers() + + /// Loads cached data and its associated HTTP cache policy from the given + /// file path in a synchronous fashion. + /// + /// # Errors + /// + /// If the given byte buffer is not in a valid format or if reading the + /// file given fails, then this returns an error. + fn from_path_sync(path: &Path) -> Result { + let file = fs_err::File::open(path).map_err(ErrorKind::Io)?; + // Note that we don't wrap our file in a buffer because it will just + // get passed to AlignedVec::extend_from_reader, which doesn't benefit + // from an intermediary buffer. In effect, the AlignedVec acts as the + // buffer. + DataWithCachePolicy::from_reader(file) + } + + /// Loads cached data and its associated HTTP cache policy from the given + /// reader. + /// + /// # Errors + /// + /// If the given byte buffer is not in a valid format or if the reader + /// fails, then this returns an error. 
+ pub fn from_reader(mut rdr: impl std::io::Read) -> Result { + let mut aligned_bytes = rkyv::util::AlignedVec::new(); + aligned_bytes + .extend_from_reader(&mut rdr) + .map_err(ErrorKind::Io)?; + DataWithCachePolicy::from_aligned_bytes(aligned_bytes) + } + + /// Loads cached data and its associated HTTP cache policy form an in + /// memory byte buffer. + /// + /// # Errors + /// + /// If the given byte buffer is not in a valid format, then this + /// returns an error. + fn from_aligned_bytes(mut bytes: AlignedVec) -> Result { + let cache_policy = DataWithCachePolicy::deserialize_cache_policy(&mut bytes)?; + Ok(DataWithCachePolicy { + data: bytes, + cache_policy, + }) + } + + /// Serializes the given cache policy and arbitrary data blob to an in + /// memory byte buffer. + /// + /// # Errors + /// + /// If there was a problem converting the given cache policy to its + /// serialized representation, then this routine will return an error. + fn serialize(cache_policy: &CachePolicy, data: &[u8]) -> Result, Error> { + let mut buf = vec![]; + DataWithCachePolicy::serialize_to_writer(cache_policy, data, &mut buf)?; + Ok(buf) + } + + /// Serializes the given cache policy and arbitrary data blob to the given + /// writer. + /// + /// # Errors + /// + /// If there was a problem converting the given cache policy to its + /// serialized representation or if the writer returns an error, then + /// this routine will return an error. + fn serialize_to_writer( + cache_policy: &CachePolicy, + data: &[u8], + mut wtr: impl std::io::Write, + ) -> Result<(), Error> { + let cache_policy_archived = OwnedArchive::from_unarchived(cache_policy)?; + let cache_policy_bytes = OwnedArchive::as_bytes(&cache_policy_archived); + wtr.write_all(data).map_err(ErrorKind::Io)?; + wtr.write_all(cache_policy_bytes).map_err(ErrorKind::Io)?; + let len = u64::try_from(cache_policy_bytes.len()).map_err(|_| { + let msg = format!( + "failed to represent {} (length of cache policy) in a u64", + cache_policy_bytes.len() + ); + ErrorKind::Io(std::io::Error::other(msg)) + })?; + wtr.write_all(&len.to_le_bytes()).map_err(ErrorKind::Io)?; + Ok(()) + } + + /// Deserializes a `OwnedArchive` off the end of the given + /// aligned bytes. Upon success, the given bytes will only contain the + /// data itself. The bytes representing the cached policy will have been + /// removed. + /// + /// # Errors + /// + /// This returns an error if the cache policy could not be deserialized + /// from the end of the given bytes. + fn deserialize_cache_policy( + bytes: &mut AlignedVec, + ) -> Result, Error> { + let len = DataWithCachePolicy::deserialize_cache_policy_len(bytes)?; + let cache_policy_bytes_start = bytes.len() - (len + 8); + let cache_policy_bytes = &bytes[cache_policy_bytes_start..][..len]; + let mut cache_policy_bytes_aligned = AlignedVec::with_capacity(len); + cache_policy_bytes_aligned.extend_from_slice(cache_policy_bytes); + assert!( + cache_policy_bytes_start <= bytes.len(), + "slicing cache policy should result in a truncation" + ); + // Technically this will keep the extra capacity used to store the + // cache policy around. But it should be pretty small, and it saves a + // realloc. (It's unclear whether that matters more or less than the + // extra memory usage.) + bytes.resize(cache_policy_bytes_start, 0); + OwnedArchive::new(cache_policy_bytes_aligned) + } + + /// Deserializes the length, in bytes, of the cache policy given a complete + /// serialized byte buffer of a `DataWithCachePolicy`. 
+ /// + /// Upon success, callers are guaranteed that + /// `&bytes[bytes.len() - (len + 8)..][..len]` will not panic. + /// + /// # Errors + /// + /// This returns an error if the length could not be read as a `usize` or is + /// otherwise known to be invalid. (For example, it is a length that is bigger + /// than `bytes.len()`.) + fn deserialize_cache_policy_len(bytes: &[u8]) -> Result { + let Some(cache_policy_len_start) = bytes.len().checked_sub(8) else { + let msg = format!( + "data-with-cache-policy buffer should be at least 8 bytes \ + in length, but is {} bytes", + bytes.len(), + ); + return Err(ErrorKind::ArchiveRead(msg).into()); + }; + let cache_policy_len_bytes = <[u8; 8]>::try_from(&bytes[cache_policy_len_start..]) + .expect("cache policy length is 8 bytes"); + let len_u64 = u64::from_le_bytes(cache_policy_len_bytes); + let Ok(len_usize) = usize::try_from(len_u64) else { + let msg = format!( + "data-with-cache-policy has cache policy length of {}, \ + but overflows usize", + len_u64, + ); + return Err(ErrorKind::ArchiveRead(msg).into()); + }; + if bytes.len() < len_usize + 8 { + let msg = format!( + "invalid cache entry: data-with-cache-policy has cache policy length of {}, \ + but total buffer size is {}", + len_usize, + bytes.len(), + ); + return Err(ErrorKind::ArchiveRead(msg).into()); + } + Ok(len_usize) } } diff --git a/crates/puffin-client/src/flat_index.rs b/crates/puffin-client/src/flat_index.rs index f62e01409..816fbd2bb 100644 --- a/crates/puffin-client/src/flat_index.rs +++ b/crates/puffin-client/src/flat_index.rs @@ -132,7 +132,7 @@ impl<'a> FlatIndexClient<'a> { .instrument(info_span!("parse_flat_index_html", url = % url)) }; let files = cached_client - .get_cached_with_callback( + .get_serde( flat_index_request, &cache_entry, cache_control, diff --git a/crates/puffin-client/src/httpcache/control.rs b/crates/puffin-client/src/httpcache/control.rs new file mode 100644 index 000000000..343494540 --- /dev/null +++ b/crates/puffin-client/src/httpcache/control.rs @@ -0,0 +1,775 @@ +use std::collections::HashSet; + +use crate::rkyvutil::OwnedArchive; + +/// Represents values for relevant cache-control directives. +/// +/// This does include some directives that we don't use mostly because they are +/// trivial to support. (For example, `must-understand` at time of writing is +/// not used in our HTTP cache semantics. Neither is `proxy-revalidate` since +/// we are not a proxy.) 
+#[derive( + Clone, + Debug, + Default, + Eq, + PartialEq, + serde::Serialize, + serde::Deserialize, + rkyv::Archive, + rkyv::Deserialize, + rkyv::Serialize, +)] +#[archive(check_bytes)] +#[archive_attr(derive(Debug))] +pub struct CacheControl { + // directives for requests and responses + /// * https://www.rfc-editor.org/rfc/rfc9111.html#name-max-age + /// * https://www.rfc-editor.org/rfc/rfc9111.html#name-max-age-2 + pub max_age_seconds: Option, + /// * https://www.rfc-editor.org/rfc/rfc9111.html#name-no-cache + /// * https://www.rfc-editor.org/rfc/rfc9111.html#name-no-cache-2 + pub no_cache: bool, + /// * https://www.rfc-editor.org/rfc/rfc9111.html#name-no-store + /// * https://www.rfc-editor.org/rfc/rfc9111.html#name-no-store-2 + pub no_store: bool, + /// * https://www.rfc-editor.org/rfc/rfc9111.html#name-no-transform + /// * https://www.rfc-editor.org/rfc/rfc9111.html#name-no-transform-2 + pub no_transform: bool, + + // request-only directives + /// https://www.rfc-editor.org/rfc/rfc9111.html#name-max-stale + pub max_stale_seconds: Option, + /// https://www.rfc-editor.org/rfc/rfc9111.html#name-min-fresh + pub min_fresh_seconds: Option, + + // response-only directives + /// https://www.rfc-editor.org/rfc/rfc9111.html#name-only-if-cached + pub only_if_cached: bool, + /// https://www.rfc-editor.org/rfc/rfc9111.html#name-must-revalidate + pub must_revalidate: bool, + /// https://www.rfc-editor.org/rfc/rfc9111.html#name-must-understand + pub must_understand: bool, + /// https://www.rfc-editor.org/rfc/rfc9111.html#name-private + pub private: bool, + /// https://www.rfc-editor.org/rfc/rfc9111.html#name-proxy-revalidate + pub proxy_revalidate: bool, + /// https://www.rfc-editor.org/rfc/rfc9111.html#name-public + pub public: bool, + /// https://www.rfc-editor.org/rfc/rfc9111.html#name-s-maxage + pub s_maxage_seconds: Option, + /// https://httpwg.org/specs/rfc8246.html + pub immutable: bool, +} + +impl CacheControl { + /// Convert this to an owned archive value. + pub fn to_archived(&self) -> OwnedArchive { + // There's no way (other than OOM) for serializing this type to fail. + OwnedArchive::from_unarchived(self).expect("all possible values can be archived") + } +} + +impl<'b, B: 'b + ?Sized + AsRef<[u8]>> FromIterator<&'b B> for CacheControl { + fn from_iter>(it: T) -> CacheControl { + CacheControl::from_iter(CacheControlParser::new(it)) + } +} + +impl FromIterator for CacheControl { + fn from_iter>(it: T) -> CacheControl { + fn parse_int(value: &[u8]) -> Option { + if !value.iter().all(u8::is_ascii_digit) { + return None; + } + std::str::from_utf8(value).ok()?.parse().ok() + } + + let mut cc = CacheControl::default(); + for ccd in it { + // Note that when we see invalid directive values, we follow [RFC + // 9111 S4.2.1]. It says that invalid cache-control directives + // should result in treating the response as stale. (Which we + // accomplished by setting `must_revalidate` to `true`.) + // + // [RFC 9111 S4.2.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.2.1 + match &*ccd.name { + // request + response directives + "max-age" => match parse_int(&ccd.value) { + None => cc.must_revalidate = true, + Some(seconds) => cc.max_age_seconds = Some(seconds), + }, + "no-cache" => cc.no_cache = true, + "no-store" => cc.no_store = true, + "no-transform" => cc.no_transform = true, + // request-only directives + "max-stale" => { + // As per [RFC 9111 S5.2.1.2], "If no value is assigned to + // max-stale, then the client will accept a stale response + // of any age." 
We implement that by just using the maximum + // number of seconds. + // + // [RFC 9111 S5.2.1.2]: https://www.rfc-editor.org/rfc/rfc9111.html#section-5.2.1.2 + if ccd.value.is_empty() { + cc.max_stale_seconds = Some(u64::MAX); + } else { + match parse_int(&ccd.value) { + None => cc.must_revalidate = true, + Some(seconds) => cc.max_stale_seconds = Some(seconds), + } + } + } + "min-fresh" => match parse_int(&ccd.value) { + None => cc.must_revalidate = true, + Some(seconds) => cc.min_fresh_seconds = Some(seconds), + }, + "only-if-cached" => cc.only_if_cached = true, + "must-revalidate" => cc.must_revalidate = true, + "must-understand" => cc.must_understand = true, + "private" => cc.private = true, + "proxy-revalidate" => cc.proxy_revalidate = true, + "public" => cc.public = true, + "s-maxage" => match parse_int(&ccd.value) { + None => cc.must_revalidate = true, + Some(seconds) => cc.s_maxage_seconds = Some(seconds), + }, + "immutable" => cc.immutable = true, + _ => {} + } + } + cc + } +} + +/// A parser for the HTTP `Cache-Control` header. +/// +/// The parser is mostly defined across multiple parts of multiple RFCs. +/// Namely, [RFC 9110 S5.6.2] says how to parse the names (or "keys") of each +/// directive (whose format is a "token"). [RFC 9110 S5.6.4] says how to parse +/// quoted values. And finally, [RFC 9111 Appendex A] gives the ABNF for the +/// overall header value. +/// +/// This parser accepts an iterator of anything that can be cheaply converted +/// to a byte string (e.g., `http::header::HeaderValue`). Directives are parsed +/// from zero or more of these byte strings. Parsing cannot return an error, +/// but if something unexpected is found, the rest of that header value is +/// skipped. +/// +/// Duplicate directives provoke an automatic insertion of `must-revalidate`, +/// as implied by [RFC 9111 S4.2.1], to ensure that the client will talk to the +/// server before using anything in case. +/// +/// This parser handles a bit more than what we actually need in +/// `puffin-client`. For example, we don't need to handle quoted values at all +/// since either don't use or care about values that require quoted. With that +/// said, the parser handles these because it wasn't that much extra work to do +/// so and just generally seemed like good sense. (If we didn't handle them and +/// parsed them incorrectly, that might mean parsing subsequent directives that +/// we do care about incorrectly.) +/// +/// [RFC 9110 S5.6.2]: https://www.rfc-editor.org/rfc/rfc9110.html#name-tokens +/// [RFC 9110 S5.6.4]: https://www.rfc-editor.org/rfc/rfc9110.html#name-quoted-strings +/// [RFC 9111 Appendix A]: https://www.rfc-editor.org/rfc/rfc9111.html#name-collected-abnf +/// [RFC 9111 S4.2.1]: https://www.rfc-editor.org/rfc/rfc9111.html#calculating.freshness.lifetime +struct CacheControlParser<'b, I> { + cur: &'b [u8], + directives: I, + seen: HashSet, +} + +impl<'b, B: 'b + ?Sized + AsRef<[u8]>, I: Iterator> CacheControlParser<'b, I> { + /// Create a new parser of zero or more `Cache-Control` header values. The + /// given iterator should yield elements that satisfy `AsRef<[u8]>`. + fn new>(headers: II) -> CacheControlParser<'b, I> { + let mut directives = headers.into_iter(); + let cur = directives.next().map(|h| h.as_ref()).unwrap_or(b""); + CacheControlParser { + cur, + directives, + seen: HashSet::new(), + } + } + + /// Parses a token according to [RFC 9110 S5.6.2]. + /// + /// If no token is found at the current position, then this returns `None`. 
+ /// Usually this indicates an invalid cache-control directive. + /// + /// This does not trim whitespace before or after the token. + /// + /// [RFC 9110 S5.6.2]: https://www.rfc-editor.org/rfc/rfc9110.html#name-tokens + fn parse_token(&mut self) -> Option { + /// Returns true when the given byte can appear in a token, as + /// defined by [RFC 9110 S5.6.2]. + /// + /// [RFC 9110 S5.6.2]: https://www.rfc-editor.org/rfc/rfc9110.html#name-tokens + fn is_token_byte(byte: u8) -> bool { + matches!( + byte, + | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' + | b'-' | b'.' | b'^' | b'_' | b'`' | b'|' | b'~' + | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z', + ) + } + let mut end = 0; + while self.cur.get(end).copied().map_or(false, is_token_byte) { + end += 1; + } + if end == 0 { + None + } else { + let (token, rest) = self.cur.split_at(end); + self.cur = rest; + // This can't fail because `end` is only incremented when the + // current byte is a valid token byte. And all valid token bytes + // are ASCII and thus valid UTF-8. + Some(String::from_utf8(token.to_vec()).expect("all valid token bytes are valid UTF-8")) + } + } + + /// Looks for an `=` as per [RFC 9111 Appendix A] which indicates that a + /// cache directive has a value. + /// + /// This returns true if one was found. In which case, the `=` is consumed. + /// + /// This does not trim whitespace before or after the token. + /// + /// [RFC 9111 Appendix A]: https://www.rfc-editor.org/rfc/rfc9111.html#name-collected-abnf + fn maybe_parse_equals(&mut self) -> bool { + if self.cur.first().map_or(false, |&byte| byte == b'=') { + self.cur = &self.cur[1..]; + true + } else { + false + } + } + + /// Parses a directive value as either an unquoted token or a quoted string + /// as per [RFC 9111 Appendix A]. + /// + /// If a valid value could not be found (for example, end-of-input or an + /// opening quote without a closing quote), then `None` is returned. In + /// this case, one should consider the cache-control header invalid. + /// + /// This does not trim whitespace before or after the token. + /// + /// Note that the returned value is *not* guaranteed to be valid UTF-8. + /// Namely, it is possible for a quoted string to contain invalid UTF-8. + /// + /// [RFC 9111 Appendix A]: https://www.rfc-editor.org/rfc/rfc9111.html#name-collected-abnf + fn parse_value(&mut self) -> Option> { + if *self.cur.first()? == b'"' { + self.cur = &self.cur[1..]; + self.parse_quoted_string() + } else { + self.parse_token().map(|s| s.into_bytes()) + } + } + + /// Parses a quoted string as per [RFC 9110 S5.6.4]. + /// + /// This assumes the opening quote has already been consumed. + /// + /// If an invalid quoted string was found (e.g., no closing quote), then + /// `None` is returned. An empty value may be returned. + /// + /// Note that the returned value is *not* guaranteed to be valid UTF-8. + /// Namely, it is possible for a quoted string to contain invalid UTF-8. 
+ /// + /// [RFC 9110 S5.6.4]: https://www.rfc-editor.org/rfc/rfc9110.html#name-quoted-strings + fn parse_quoted_string(&mut self) -> Option> { + fn is_qdtext_byte(byte: u8) -> bool { + matches!(byte, b'\t' | b' ' | 0x21 | 0x23..=0x5B | 0x5D..=0x7E | 0x80..=0xFF) + } + fn is_quoted_pair_byte(byte: u8) -> bool { + matches!(byte, b'\t' | b' ' | 0x21..=0x7E | 0x80..=0xFF) + } + let mut value = vec![]; + while !self.cur.is_empty() { + let byte = self.cur[0]; + self.cur = &self.cur[1..]; + if byte == b'"' { + return Some(value); + } else if byte == b'\\' { + let byte = *self.cur.first()?; + self.cur = &self.cur[1..]; + // If we saw an escape but didn't see a valid + // escaped byte, then we treat this value as + // invalid. + if !is_quoted_pair_byte(byte) { + return None; + } + value.push(byte); + } else if is_qdtext_byte(byte) { + value.push(byte); + } else { + break; + } + } + // If we got here, it means we hit end-of-input before seeing a closing + // quote. So we treat this as invalid and return `None`. + None + } + + /// Looks for a `,` as per [RFC 9111 Appendix A]. If one is found, then it + /// is consumed and this returns true. + /// + /// This does not trim whitespace before or after the token. + /// + /// [RFC 9111 Appendix A]: https://www.rfc-editor.org/rfc/rfc9111.html#name-collected-abnf + fn maybe_parse_directive_delimiter(&mut self) -> bool { + if self.cur.first().map_or(false, |&byte| byte == b',') { + self.cur = &self.cur[1..]; + true + } else { + false + } + } + + /// [RFC 9111 Appendix A] says that optional whitespace may appear between + /// cache directives. We actually also allow whitespace to appear before + /// the first directive and after the last directive. + /// + /// [RFC 9111 Appendix A]: https://www.rfc-editor.org/rfc/rfc9111.html#name-collected-abnf + fn skip_whitespace(&mut self) { + while self.cur.first().map_or(false, u8::is_ascii_whitespace) { + self.cur = &self.cur[1..]; + } + } + + fn emit_directive( + &mut self, + directive: CacheControlDirective, + ) -> Option { + let duplicate = !self.seen.insert(directive.name.clone()); + if duplicate { + self.emit_revalidation() + } else { + Some(directive) + } + } + + fn emit_revalidation(&mut self) -> Option { + if self.seen.insert("must-revalidate".to_string()) { + Some(CacheControlDirective::must_revalidate()) + } else { + // If we've already emitted a must-revalidate + // directive, then don't do it again. + None + } + } +} + +impl<'b, B: 'b + ?Sized + AsRef<[u8]>, I: Iterator> Iterator + for CacheControlParser<'b, I> +{ + type Item = CacheControlDirective; + + fn next(&mut self) -> Option { + loop { + if self.cur.is_empty() { + self.cur = self.directives.next().map(|h| h.as_ref())?; + } + while !self.cur.is_empty() { + self.skip_whitespace(); + let Some(mut name) = self.parse_token() else { + // If we fail to parse a token, then this header value is + // either corrupt or empty. So skip the rest of it. + let invalid = !self.cur.is_empty(); + self.cur = b""; + // But if it was invalid, force revalidation. + if invalid { + if let Some(d) = self.emit_revalidation() { + return Some(d); + } + } + break; + }; + name.make_ascii_lowercase(); + if !self.maybe_parse_equals() { + // Eat up whitespace and the next delimiter. We don't care + // if we find a terminator. 
+ self.skip_whitespace(); + self.maybe_parse_directive_delimiter(); + let directive = CacheControlDirective { + name, + value: vec![], + }; + match self.emit_directive(directive) { + None => continue, + Some(d) => return Some(d), + } + } + let Some(value) = self.parse_value() else { + // If we expected a value (we saw an =) but couldn't find a + // valid value, then this header value is probably corrupt. + // So skip the rest of it. + self.cur = b""; + match self.emit_revalidation() { + None => break, + Some(d) => return Some(d), + } + }; + // Eat up whitespace and the next delimiter. We don't care if + // we find a terminator. + self.skip_whitespace(); + self.maybe_parse_directive_delimiter(); + let directive = CacheControlDirective { name, value }; + match self.emit_directive(directive) { + None => continue, + Some(d) => return Some(d), + } + } + } + } +} + +/// A single directive from the `Cache-Control` header. +#[derive(Debug, Eq, PartialEq)] +struct CacheControlDirective { + /// The name of the directive. + name: String, + /// A possibly empty value. + /// + /// Note that directive values may contain invalid UTF-8. (Although they + /// cannot actually contain arbitrary bytes. For example, NUL bytes, among + /// others, are not allowed.) + value: Vec, +} + +impl CacheControlDirective { + /// Returns a `must-revalidate` directive. This is useful for forcing a + /// cache decision that the response is stale, and thus the server should + /// be consulted for whether the cached response ought to be used or not. + fn must_revalidate() -> CacheControlDirective { + CacheControlDirective { + name: "must-revalidate".to_string(), + value: vec![], + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn cache_control_token() { + let cc: CacheControl = CacheControlParser::new(["no-cache"]).collect(); + assert!(cc.no_cache); + assert!(!cc.must_revalidate); + } + + #[test] + fn cache_control_max_age() { + let cc: CacheControl = CacheControlParser::new(["max-age=60"]).collect(); + assert_eq!(Some(60), cc.max_age_seconds); + assert!(!cc.must_revalidate); + } + + // [RFC 9111 S5.2.1.1] says that client MUST NOT quote max-age, but we + // support parsing it that way anyway. + // + // [RFC 9111 S5.2.1.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-5.2.1.1 + #[test] + fn cache_control_max_age_quoted() { + let cc: CacheControl = CacheControlParser::new([r#"max-age="60""#]).collect(); + assert_eq!(Some(60), cc.max_age_seconds); + assert!(!cc.must_revalidate); + } + + #[test] + fn cache_control_max_age_invalid() { + let cc: CacheControl = CacheControlParser::new(["max-age=6a0"]).collect(); + assert_eq!(None, cc.max_age_seconds); + assert!(cc.must_revalidate); + } + + #[test] + fn cache_control_immutable() { + let cc: CacheControl = CacheControlParser::new(["max-age=31536000, immutable"]).collect(); + assert_eq!(Some(31_536_000), cc.max_age_seconds); + assert!(cc.immutable); + assert!(!cc.must_revalidate); + } + + #[test] + fn cache_control_unrecognized() { + let cc: CacheControl = CacheControlParser::new(["lion,max-age=60,zebra"]).collect(); + assert_eq!(Some(60), cc.max_age_seconds); + } + + #[test] + fn cache_control_invalid_squashes_remainder() { + let cc: CacheControl = CacheControlParser::new(["no-cache,\x00,max-age=60"]).collect(); + // The invalid data doesn't impact things before it. + assert!(cc.no_cache); + // The invalid data precludes parsing anything after. + assert_eq!(None, cc.max_age_seconds); + // The invalid contents should force revalidation. 
+ assert!(cc.must_revalidate); + } + + #[test] + fn cache_control_invalid_squashes_remainder_but_not_other_header_values() { + let cc: CacheControl = + CacheControlParser::new(["no-cache,\x00,max-age=60", "max-stale=30"]).collect(); + // The invalid data doesn't impact things before it. + assert!(cc.no_cache); + // The invalid data precludes parsing anything after + // in the same header value, but not in other + // header values. + assert_eq!(Some(30), cc.max_stale_seconds); + // The invalid contents should force revalidation. + assert!(cc.must_revalidate); + } + + #[test] + fn cache_control_parse_token() { + let directives = CacheControlParser::new(["no-cache"]).collect::>(); + assert_eq!( + directives, + vec![CacheControlDirective { + name: "no-cache".to_string(), + value: vec![] + }] + ); + } + + #[test] + fn cache_control_parse_token_to_token_value() { + let directives = CacheControlParser::new(["max-age=60"]).collect::>(); + assert_eq!( + directives, + vec![CacheControlDirective { + name: "max-age".to_string(), + value: b"60".to_vec(), + }] + ); + } + + #[test] + fn cache_control_parse_token_to_quoted_string() { + let directives = + CacheControlParser::new([r#"private="cookie,x-something-else""#]).collect::>(); + assert_eq!( + directives, + vec![CacheControlDirective { + name: "private".to_string(), + value: b"cookie,x-something-else".to_vec(), + }] + ); + } + + #[test] + fn cache_control_parse_token_to_quoted_string_with_escape() { + let directives = + CacheControlParser::new([r#"private="something\"crazy""#]).collect::>(); + assert_eq!( + directives, + vec![CacheControlDirective { + name: "private".to_string(), + value: br#"something"crazy"#.to_vec(), + }] + ); + } + + #[test] + fn cache_control_parse_multiple_directives() { + let header = r#"max-age=60, no-cache, private="cookie", no-transform"#; + let directives = CacheControlParser::new([header]).collect::>(); + assert_eq!( + directives, + vec![ + CacheControlDirective { + name: "max-age".to_string(), + value: b"60".to_vec(), + }, + CacheControlDirective { + name: "no-cache".to_string(), + value: vec![] + }, + CacheControlDirective { + name: "private".to_string(), + value: b"cookie".to_vec(), + }, + CacheControlDirective { + name: "no-transform".to_string(), + value: vec![] + }, + ] + ); + } + + #[test] + fn cache_control_parse_multiple_directives_across_multiple_header_values() { + let headers = [ + r#"max-age=60, no-cache"#, + r#"private="cookie""#, + r#"no-transform"#, + ]; + let directives = CacheControlParser::new(headers).collect::>(); + assert_eq!( + directives, + vec![ + CacheControlDirective { + name: "max-age".to_string(), + value: b"60".to_vec(), + }, + CacheControlDirective { + name: "no-cache".to_string(), + value: vec![] + }, + CacheControlDirective { + name: "private".to_string(), + value: b"cookie".to_vec(), + }, + CacheControlDirective { + name: "no-transform".to_string(), + value: vec![] + }, + ] + ); + } + + #[test] + fn cache_control_parse_one_header_invalid() { + let headers = [ + r#"max-age=60, no-cache"#, + r#", private="cookie""#, + r#"no-transform"#, + ]; + let directives = CacheControlParser::new(headers).collect::>(); + assert_eq!( + directives, + vec![ + CacheControlDirective { + name: "max-age".to_string(), + value: b"60".to_vec(), + }, + CacheControlDirective { + name: "no-cache".to_string(), + value: vec![] + }, + CacheControlDirective { + name: "must-revalidate".to_string(), + value: vec![] + }, + CacheControlDirective { + name: "no-transform".to_string(), + value: vec![] + }, + ] + ); + } + + 
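+    // Not covered by the tests above: a small sketch of the empty-header
+    // case. An empty `Cache-Control` value yields no directives at all and,
+    // in particular, does not force revalidation.
+    #[test]
+    fn cache_control_parse_empty_header_value() {
+        let directives = CacheControlParser::new([""]).collect::<Vec<_>>();
+        assert!(directives.is_empty());
+    }
+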
#[test] + fn cache_control_parse_invalid_directive_drops_remainder() { + let header = r#"max-age=60, no-cache, ="cookie", no-transform"#; + let directives = CacheControlParser::new([header]).collect::>(); + assert_eq!( + directives, + vec![ + CacheControlDirective { + name: "max-age".to_string(), + value: b"60".to_vec(), + }, + CacheControlDirective { + name: "no-cache".to_string(), + value: vec![] + }, + CacheControlDirective { + name: "must-revalidate".to_string(), + value: vec![] + }, + ] + ); + } + + #[test] + fn cache_control_parse_name_normalized() { + let header = r#"MAX-AGE=60"#; + let directives = CacheControlParser::new([header]).collect::>(); + assert_eq!( + directives, + vec![CacheControlDirective { + name: "max-age".to_string(), + value: b"60".to_vec(), + },] + ); + } + + // When a duplicate directive is found, we keep the first one + // and add in a `must-revalidate` directive to indicate that + // things are stale and the client should do a re-check. + #[test] + fn cache_control_parse_duplicate_directives() { + let header = r#"max-age=60, no-cache, max-age=30"#; + let directives = CacheControlParser::new([header]).collect::>(); + assert_eq!( + directives, + vec![ + CacheControlDirective { + name: "max-age".to_string(), + value: b"60".to_vec(), + }, + CacheControlDirective { + name: "no-cache".to_string(), + value: vec![] + }, + CacheControlDirective { + name: "must-revalidate".to_string(), + value: vec![] + }, + ] + ); + } + + #[test] + fn cache_control_parse_duplicate_directives_across_headers() { + let headers = [r#"max-age=60, no-cache"#, r#"max-age=30"#]; + let directives = CacheControlParser::new(headers).collect::>(); + assert_eq!( + directives, + vec![ + CacheControlDirective { + name: "max-age".to_string(), + value: b"60".to_vec(), + }, + CacheControlDirective { + name: "no-cache".to_string(), + value: vec![] + }, + CacheControlDirective { + name: "must-revalidate".to_string(), + value: vec![] + }, + ] + ); + } + + // Tests that we don't emit must-revalidate multiple times + // even when something is duplicated multiple times. + #[test] + fn cache_control_parse_duplicate_redux() { + let header = r#"max-age=60, no-cache, no-cache, max-age=30"#; + let directives = CacheControlParser::new([header]).collect::>(); + assert_eq!( + directives, + vec![ + CacheControlDirective { + name: "max-age".to_string(), + value: b"60".to_vec(), + }, + CacheControlDirective { + name: "no-cache".to_string(), + value: vec![] + }, + CacheControlDirective { + name: "must-revalidate".to_string(), + value: vec![] + }, + ] + ); + } +} diff --git a/crates/puffin-client/src/httpcache/mod.rs b/crates/puffin-client/src/httpcache/mod.rs new file mode 100644 index 000000000..8e1344865 --- /dev/null +++ b/crates/puffin-client/src/httpcache/mod.rs @@ -0,0 +1,1358 @@ +/*! +A somewhat simplistic implementation of HTTP cache semantics. + +This implementation was guided by the following things: + +* RFCs 9110 and 9111. +* The `http-cache-semantics` crate. (The implementation here is completely +different, but the source of `http-cache-semantics` helped guide the +implementation here and understanding of HTTP caching.) +* A desire for our cache policy to support zero-copy deserialization. That +is, we want the cached response fast path (where no revalidation request is +necessary) to avoid any costly deserialization for the cache policy at all. + +# Flow + +While one has to read the relevant RFCs to get a full understanding of HTTP +caching, doing so is... difficult to say the least. 
It is at the very least
+not quick to do because the semantics are scattered all over the place. But, I
+think we can do a quick overview here.
+
+Let's start with the obvious. HTTP caching exists to avoid network requests,
+and, if a request is unavoidable, bandwidth. The central actor in HTTP
+caching is the `Cache-Control` header, which can exist on *both* requests and
+responses. The value of this header is a list of directives that control
+caching behavior. They can outright disable it (`no-store`), force cache
+invalidation (`no-cache`) or even permit the cache to return responses that
+are explicitly stale (`max-stale`).
+
+The main thing that typically drives cache interactions is `max-age`. When set
+on a response, this means that the server is willing to let clients hold on to
+a response for up to the amount of time in `max-age` before the client must ask
+the server for a fresh response. In our case, the main utility of `max-age` is
+twofold:
+
+* PyPI sets a `max-age` of 600 seconds (10 minutes) on its responses. As long
+as our cached responses have an age less than this, we can completely avoid
+talking to PyPI at all when we need access to the full set of versions for a
+package.
+* Most other assets, like wheels, are forever immutable. They will never
+change. So servers will typically set a very high `max-age`, which means we
+will almost never need to ask the server for permission to reuse our cached
+wheel.
+
+When a cached response exceeds the `max-age` configured on a response, then
+we call that response stale. Generally speaking, we won't return responses
+from the cache that are known to be stale. (This can be overridden in the
+request by adding a `max-stale` cache-control directive, but nothing in Puffin
+does this at time of writing.) When a response is stale, we don't necessarily
+need to give up completely. It is at this point that we can send something
+called a re-validation request.
+
+A re-validation request includes with it some metadata (usually an "entity tag"
+or `etag` for short) that was on the cached response (which is now stale).
+When we send this request, the server can compare it with its most up-to-date
+version of the resource. If its entity tag matches the one we gave it (among
+other possible criteria), then the server can skip returning the body and
+instead just return a small HTTP 304 NOT MODIFIED response. When we get this
+type of response, it's the server telling us that our cached response which we
+*thought* was stale is no longer stale. It's fresh again and we needn't get a
+new copy. We will need to update our stored `CachePolicy` though, since the
+HTTP 304 NOT MODIFIED response we got might include updated metadata relevant
+to the behavior of caching (like a new `Age` header).
+
+# Scope
+
+In general, the cache semantics implemented below are targeted toward Puffin's
+use case: a private client cache for custom data objects. This constraint
+results in a modest simplification in what we need to support. That is, we
+don't need to cache the entirety of the request's or response's headers (like
+what `http-cache-semantics` does). Instead, we only need to cache the data
+necessary to *make decisions* about HTTP caching.
+
+One example of this is the `Vary` response header. This requires checking that
+the headers listed in a cached response have the same value in the original
+request and the new request.
If the new request has different values for those
+headers (as specified in the cached response) than what was in the original
+request, then the new request cannot use our cached response. Normally, this
+would seemingly require storing all of the original request's headers. But we
+only store the headers listed in the response.
+
+Also, since we aren't a proxy, there are a host of proxy-specific rules for
+managing headers and data that we needn't care about.
+
+# Zero-copy deserialization
+
+As mentioned above, we would really like our fast path (that is, a cached
+response that we deem "fresh" and thus don't need to send a re-validation
+request for) to avoid needing to actually deserialize a `CachePolicy`. While a
+`CachePolicy` isn't particularly big, it is in our critical path. Yet, we still
+need a `CachePolicy` to be able to decide whether a cached response is still
+fresh or not. (This decision procedure is non-trivial, so it *probably* doesn't
+make too much sense to hack around it with something simpler.)
+
+We attempt to achieve this by implementing the `rkyv` traits for all of our
+types. This means that if we read a `Vec<u8>` from a file, then we can very
+cheaply turn that into a `rkyvutil::OwnedArchive<CachePolicy>`. Creating that
+only requires a quick validation step, but is otherwise free. We can then
+use that as if it were an `Archived<CachePolicy>` (which is an alias for the
+`ArchivedCachePolicy` type implicitly introduced by `derive(rkyv::Archive)`).
+Crucially, this is why we implement all of our HTTP cache semantics logic on
+`ArchivedCachePolicy` and *not* `CachePolicy`. It can be easy to forget this
+because `rkyv` does such an amazing job of making its use of archived types
+very closely resemble that of the standard types. For example, whenever the
+methods below are accessing a field whose type is a `Vec` in the normal type,
+what's actually being accessed is a [`rkyv::vec::ArchivedVec`]. Similarly,
+for strings, it's [`rkyv::string::ArchivedString`] and not a standard library
+`String`. This all works somewhat seamlessly because all of the cache semantics
+are generally just read-only operations, but if you stray from the path, you're
+likely to get whacked over the head.
+
+One catch here is that we actually want the HTTP cache semantics to be
+available on `CachePolicy` too. At least, at time of writing, we do. To
+achieve this, `CachePolicy::to_archived` is provided, which will serialize the
+`CachePolicy` to its archived representation in bytes, and then turn that
+into an `OwnedArchive<CachePolicy>` which derefs to `ArchivedCachePolicy`.
+This is a little extra cost, but the idea is that a `CachePolicy` (not an
+`ArchivedCachePolicy`) should only be used in the slower path (i.e., when you
+actually need to make an HTTP request).
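+
+To make the intended flow a bit more concrete, here is a rough sketch of the
+fast path. It is only a sketch: loading the cached policy bytes and the actual
+network calls are elided, and the `consult_cache` function itself is
+illustrative, although the types and methods it uses are the ones defined
+below.
+
+```ignore
+// `policy` derefs to `ArchivedCachePolicy`, so this decision involves no
+// deserialization of `CachePolicy` at all.
+fn consult_cache(policy: &OwnedArchive<CachePolicy>, request: &mut reqwest::Request) {
+    match policy.before_request(request) {
+        // Still fresh: reuse the cached response with no network I/O.
+        BeforeRequest::Fresh => { /* return the cached data */ }
+        // Stale: `request` now carries revalidation headers. Send it and
+        // hand the result to `after_response` to decide what to keep.
+        BeforeRequest::Stale(_builder) => { /* send a revalidation request */ }
+        // The policy doesn't correspond to this request: treat it as a miss.
+        BeforeRequest::NoMatch => { /* fetch anew */ }
+    }
+}
+```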
+
+[`rkyv::vec::ArchivedVec`]: https://docs.rs/rkyv/0.7.43/rkyv/vec/struct.ArchivedVec.html
+[`rkyv::string::ArchivedString`]: https://docs.rs/rkyv/0.7.43/rkyv/string/struct.ArchivedString.html
+
+# Additional reading
+
+* Short introduction to `Cache-Control`: https://csswizardry.com/2019/03/cache-control-for-civilians/
+* Caching best practices: https://jakearchibald.com/2016/caching-best-practices/
+* Overview of HTTP caching: https://developer.mozilla.org/en-US/docs/Web/HTTP/Caching
+* MDN docs for `Cache-Control`: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Cache-Control
+* The 1997 RFC for HTTP 1.1: https://www.rfc-editor.org/rfc/rfc2068#section-13
+* The 1999 update to HTTP 1.1: https://www.rfc-editor.org/rfc/rfc2616.html#section-13
+* The "stale content" cache-control extension: https://httpwg.org/specs/rfc5861.html
+* HTTP 1.1 caching (superseded by RFC 9111): https://httpwg.org/specs/rfc7234.html
+* The "immutable" cache-control extension: https://httpwg.org/specs/rfc8246.html
+* HTTP semantics (If-None-Match, etc.): https://www.rfc-editor.org/rfc/rfc9110#section-8.8.3
+* HTTP caching (obsoletes RFC 7234): https://www.rfc-editor.org/rfc/rfc9111.html
+*/
+
+use std::time::{Duration, SystemTime};
+
+use {http::header::HeaderValue, rkyv::bytecheck};
+
+use crate::rkyvutil::OwnedArchive;
+
+use self::control::CacheControl;
+
+mod control;
+
+/// Knobs to configure Puffin's cache behavior.
+///
+/// At time of writing, we don't expose any way of modifying these since I
+/// suspect we won't ever need to. We split them out into their own type so
+/// that they can be shared between `CachePolicyBuilder` and `CachePolicy`.
+#[derive(Clone, Debug, rkyv::Archive, rkyv::CheckBytes, rkyv::Deserialize, rkyv::Serialize)]
+// Since `CacheConfig` is so simple, we can use the type itself as the archived
+// type. But note that this will fall apart if even something like an `Option`
+// is added.
+#[archive(as = "Self")]
+#[repr(C)]
+struct CacheConfig {
+    shared: bool,
+    heuristic_percent: u8,
+}
+
+impl Default for CacheConfig {
+    fn default() -> CacheConfig {
+        CacheConfig {
+            // The caching that Puffin does ought to be considered
+            // private.
+            shared: false,
+            // This is only used to heuristically guess at a freshness lifetime
+            // when other indicators (such as `max-age` and `Expires`) are
+            // absent.
+            heuristic_percent: 10,
+        }
+    }
+}
+
+/// A builder for constructing a `CachePolicy`.
+///
+/// A builder can be used directly when spawning fresh HTTP requests
+/// without a cached response. A builder is also constructed for you via
+/// [`CachePolicy::before_request`] when a cached response exists but is stale.
+///
+/// The main idea of a builder is that it manages the flow of data needed to
+/// construct a `CachePolicy`. That is, you start with an HTTP request, then
+/// you get a response and finally a new `CachePolicy`.
+#[derive(Debug)]
+pub struct CachePolicyBuilder {
+    /// The configuration controlling the behavior of the cache.
+    config: CacheConfig,
+    /// A subset of information from the HTTP request that we will store. This
+    /// is needed to make future decisions about cache behavior.
+    request: Request,
+    /// The full set of request headers. This copy is necessary because the
+    /// headers are needed in order to correctly capture the values necessary
+    /// to implement the `Vary` check, as per [RFC 9111 S4.1]. The upside is
+    /// that this is not actually persisted in a `CachePolicy`. We only need it
+    /// until we have the response.
+ /// + /// The precise reason why this copy is intrinsically needed is because + /// sending a request requires ownership of the request. Yet, we don't know + /// which header values we need to store in our cache until we get the + /// response back. Thus, these headers must be persisted until after the + /// point we've given up ownership of the request. + /// + /// [RFC 9111 S4.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.1 + request_headers: http::HeaderMap, +} + +impl CachePolicyBuilder { + /// Create a new builder of a cache policy, starting with the request. + pub fn new(request: &reqwest::Request) -> CachePolicyBuilder { + let config = CacheConfig::default(); + let request_headers = request.headers().clone(); + let request = Request::from(request); + CachePolicyBuilder { + config, + request, + request_headers, + } + } + + /// Return a new policy given the response to the request that this builder + /// was created with. + pub fn build(self, response: &reqwest::Response) -> CachePolicy { + let vary = Vary::from_request_response_headers(&self.request_headers, response.headers()); + CachePolicy { + config: self.config, + request: self.request, + response: Response::from(response), + vary, + } + } +} + +/// A value encapsulating the data needed to implement HTTP caching behavior +/// for Puffin. +/// +/// A cache policy is meant to be stored and persisted with the data being +/// cached. It is specifically meant to capture the smallest amount of +/// information needed to determine whether a cached response is stale or not, +/// and the information required to issue a re-validation request. +/// +/// This does not provide a complete set of HTTP cache semantics. Notably +/// absent from this (among other things that Puffin probably doesn't care +/// about it) are proxy cache semantics. +#[derive(Debug, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)] +#[archive(check_bytes)] +#[archive_attr(derive(Debug))] +pub struct CachePolicy { + /// The configuration controlling the behavior of the cache. + config: CacheConfig, + /// A subset of information from the HTTP request that we will store. This + /// is needed to make future decisions about cache behavior. + request: Request, + /// A subset of information from the HTTP response that we will store. This + /// is needed to make future decisions about cache behavior. + response: Response, + /// This contains the set of vary header names (from the cached response) + /// and the corresponding values (from the original request) used to verify + /// whether a new request can utilize a cached response or not. This is + /// placed outside of `request` and `response` because it contains bits + /// from both! + vary: Vary, +} + +impl CachePolicy { + /// Convert this to an owned archive value. + /// + /// It's necessary to call this in order to make decisions with this cache + /// policy. Namely, all of the cache semantics logic is implemented on the + /// archived types. + /// + /// These do incur an extra cost, but this should only be needed when you + /// don't have an `ArchivedCachePolicy`. And that should only occur when + /// you're actually performing an HTTP request. In that case, the extra + /// cost that is done here to convert a `CachePolicy` to its archived form + /// should be marginal. + pub fn to_archived(&self) -> OwnedArchive { + // There's no way (other than OOM) for serializing this type to fail. 
+ OwnedArchive::from_unarchived(self).expect("all possible values can be archived") + } +} + +impl ArchivedCachePolicy { + /// Determines what caching behavior is correct given an existing + /// `CachePolicy` and a new HTTP request for the resource managed by this + /// cache policy. This is done as per [RFC 9111 S4]. + /// + /// Calling this method conceptually corresponds to asking the following + /// question: "I have a cached response for an incoming HTTP request. May I + /// return that cached response, or do I need to go back to the progenitor + /// of that response to determine whether it's still the latest thing?" + /// + /// This returns one of three possible behaviors: + /// + /// 1. The cached response is still fresh, and the caller may return + /// the cached response without issuing an HTTP requests. + /// 2. The cached response is stale. The caller should send a re-validation + /// request and then call `CachePolicy::after_response` to determine whether + /// the cached response is actually fresh, or if it's stale and needs to + /// be updated. + /// 3. The given request does not match the cache policy identification. + /// Generally speaking, this usually implies a bug with the cache in that + /// it loaded a cache policy that does not match the request. + /// + /// In the case of (2), the given request is modified in place such that + /// it is suitable as a revalidation request. + /// + /// [RFC 9111 S4]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4 + pub fn before_request(&self, request: &mut reqwest::Request) -> BeforeRequest { + let now = SystemTime::now(); + // If the response was never storable, then we just bail out + // completely. + if !self.is_storable() { + tracing::trace!( + "request {} does not match cache request {} because it isn't storable", + request.url(), + self.request.uri, + ); + return BeforeRequest::NoMatch; + } + // "When presented with a request, a cache MUST NOT reuse a stored + // response unless..." + // + // "the presented target URI and that of the stored response match, + // and..." + if self.request.uri != request.url().as_str() { + tracing::trace!( + "request {} does not match cache URL of {}", + request.url(), + self.request.uri, + ); + return BeforeRequest::NoMatch; + } + // "the request method associated with the stored response allows it to + // be used for the presented request, and..." + if request.method() != http::Method::GET && request.method() != http::Method::HEAD { + tracing::trace!( + "method {:?} for request {} is not supported by this cache", + request.method(), + request.url(), + ); + return BeforeRequest::NoMatch; + } + // "request header fields nominated by the stored response (if any) + // match those presented, and..." + // + // We don't support the `Vary` header, so if it was set, we + // conservatively require revalidation. + if !self.vary.matches(request.headers()) { + tracing::trace!( + "request {} does not match cached request because of the 'Vary' header", + request.url(), + ); + self.set_revalidation_headers(request); + return BeforeRequest::Stale(self.new_cache_policy_builder(request)); + } + // "the stored response does not contain the no-cache directive, unless + // it is successfully validated, and..." + if self.response.headers.cc.no_cache { + self.set_revalidation_headers(request); + return BeforeRequest::Stale(self.new_cache_policy_builder(request)); + } + // "the stored response is one of the following: ..." + // + // "fresh, or..." + // "allowed to be served stale, or..." 
+ if self.is_fresh(now, request) { + return BeforeRequest::Fresh; + } + // "successfully validated." + // + // In this case, callers will need to send a revalidation request. + self.set_revalidation_headers(request); + BeforeRequest::Stale(self.new_cache_policy_builder(request)) + } + + /// This implements the logic for handling the response to a request that + /// may be a revalidation request, as per [RFC 9111 S4.3.3] and [RFC 9111 + /// S4.3.4]. That is, the cache policy builder given here should be the one + /// returned by `CachePolicy::before_request` with the response received + /// from the origin server for the possibly-revalidating request. + /// + /// Even if the request is new (in that there is no response cached + /// for it), callers may use this routine. But generally speaking, + /// callers are only supposed to use this routine after getting a + /// [`BeforeRequest::Stale`]. + /// + /// The return value indicates whether the cached response is still fresh + /// (that is, `AfterResponse::NotModified`) or if it has changed (that is, + /// `AfterResponse::Modified`). In the latter case, the cached response has + /// been invalidated and the caller should cache the new response. In the + /// former case, the cached response is still considered fresh. + /// + /// In either case, callers should update their cache with the new policy. + /// + /// [RFC 9111 S4.3.3]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.3.3 + /// [RFC 9111 S4.3.4]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.3.4 + pub fn after_response( + &self, + cache_policy_builder: CachePolicyBuilder, + response: &reqwest::Response, + ) -> AfterResponse { + let mut new_policy = cache_policy_builder.build(response); + if self.is_modified(&new_policy) { + AfterResponse::Modified(new_policy) + } else { + new_policy.response.status = self.response.status; + AfterResponse::NotModified(new_policy) + } + } + + fn is_modified(&self, new_policy: &CachePolicy) -> bool { + // From [RFC 9111 S4.3.3], + // + // "A 304 (Not Modified) response status code indicates that the stored + // response can be updated and reused" + // + // So if we don't get a 304, then we know our cached response is seen + // as stale by the origin server. + // + // [RFC 9111 S4.3.3]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.3.3 + if new_policy.response.status != 304 { + return true; + } + // As per [RFC 9111 S4.3.4], we need to confirm that our validators match. Here, + // we check `ETag`. + // + // [RFC 9111 S4.3.4]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.3.4 + if let Some(old_etag) = self.response.headers.etag.as_ref() { + if let Some(new_etag) = new_policy.response.headers.etag.as_ref() { + // We don't support weak validators, so only match if they're + // both strong. + if !old_etag.weak && !new_etag.weak && old_etag.value == new_etag.value { + return false; + } + } + } + // As per [RFC 9111 S4.3.4], we need to confirm that our validators match. Here, + // we check `Last-Modified`. + // + // [RFC 9111 S4.3.4]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.3.4 + if let Some(old_last_modified) = self.response.headers.last_modified_unix_timestamp.as_ref() + { + if let Some(new_last_modified) = new_policy + .response + .headers + .last_modified_unix_timestamp + .as_ref() + { + if old_last_modified == new_last_modified { + return false; + } + } + } + // As per [RFC 9111 S4.3.4], if we have no validators anywhere, then + // we can just rely on the HTTP 304 status code and reuse the cached + // response. 
+ // + // [RFC 9111 S4.3.4]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.3.4 + if self.response.headers.etag.is_none() + && new_policy.response.headers.etag.is_none() + && self.response.headers.last_modified_unix_timestamp.is_none() + && new_policy + .response + .headers + .last_modified_unix_timestamp + .is_none() + { + return false; + } + true + } + + /// Sets the relevant headers on the given request so that it can be used + /// as a revalidation request. As per [RFC 9111 S4.3.1], this permits the + /// origin server to check if the content is different from our cached + /// response. If it isn't, then the origin server can return an HTTP 304 + /// NOT MODIFIED status, which avoids the need to re-transmit the response + /// body. That is, it indicates that our cached response is still fresh. + /// + /// This will always use a strong etag validator if it's present on the + /// cached response. If the given request already has an etag validator + /// on it, this routine will add to it and not replace it. + /// + /// In contrast, if the request already has the `If-Modified-Since` header + /// set, then this will not change or replace it. If it's not set, then one + /// is added if the cached response had a valid `Last-Modified` header. + /// + /// [RFC 9111 S4.3.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.3.1 + fn set_revalidation_headers(&self, request: &mut reqwest::Request) { + // As per [RFC 9110 13.1.2] and [RFC 9111 S4.3.1], if our stored + // response has an etag, we should send it back via the `If-None-Match` + // header. The idea is that the server should only "do" the request if + // none of the tags match. If there is a match, then the server can + // return HTTP 304 indicating that our stored response is still fresh. + // + // [RFC 9110 S13.1.2]: https://www.rfc-editor.org/rfc/rfc9110#section-13.1.2 + // [RFC 9111 S4.3.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.3.1 + if let Some(etag) = self.response.headers.etag.as_ref() { + // We don't support weak validation principally because we want to + // be notified if there was a change in the content. Namely, from + // RFC 9110 S13.1.2: "... weak entity tags can be used for cache + // validation even if there have been changes to the representation + // data." + if !etag.weak { + if let Ok(header) = HeaderValue::from_bytes(&etag.value) { + request.headers_mut().append("if-none-match", header); + } + } + } + // We also set `If-Modified-Since` as per [RFC 9110 S13.1.3] and [RFC + // 9111 S4.3.1]. Generally, `If-None-Match` will override this, but we + // set it in case `If-None-Match` is not supported. + // + // [RFC 9110 S13.1.3]: https://www.rfc-editor.org/rfc/rfc9110#section-13.1.3 + // [RFC 9111 S4.3.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.3.1 + if !request.headers().contains_key("if-modified-since") { + if let Some(&last_modified_unix_timestamp) = + self.response.headers.last_modified_unix_timestamp.as_ref() + { + if let Some(last_modified) = unix_timestamp_to_header(last_modified_unix_timestamp) + { + request + .headers_mut() + .insert("if-modified-since", last_modified); + } + } + } + } + + /// Returns true if and only if the response is storable as per + /// [RFC 9111 S3]. + /// + /// [RFC 9111 S3]: https://www.rfc-editor.org/rfc/rfc9111.html#section-3 + pub fn is_storable(&self) -> bool { + // In the absence of other signals, we are limited to caching responses + // with a code that is heuristically cacheable as per [RFC 9110 S15.1]. 
+ // + // [RFC 9110 S15.1]: https://www.rfc-editor.org/rfc/rfc9110#section-15.1 + const HEURISTICALLY_CACHEABLE_STATUS_CODES: &[u16] = + &[200, 203, 204, 206, 300, 301, 308, 404, 405, 410, 414, 501]; + + // N.B. This routine could be "simpler", but we bias toward + // following the flow of logic as closely as possible as written + // in RFC 9111 S3. + + // "the request method is understood by the cache" + // + // We just don't bother with anything that isn't GET. + if !matches!( + self.request.method, + ArchivedMethod::Get | ArchivedMethod::Head + ) { + tracing::trace!( + "cached request {} is not storable because of its method {:?}", + self.request.uri, + self.request.method + ); + return false; + } + // "the response status code is final" + // + // ... and we'll put more restrictions on status code + // below, but we can bail out early here. + if !self.response.has_final_status() { + tracing::trace!( + "cached request {} is not storable because its response has \ + non-final status code {:?}", + self.request.uri, + self.response.status, + ); + return false; + } + // "if the response status code is 206 or 304, or the must-understand + // cache directive (see Section 5.2.2.3) is present: the cache + // understands the response status code" + // + // We don't currently support `must-understand`. We also don't support + // partial content (206). And 304 not modified shouldn't be cached + // itself. + if self.response.status == 206 || self.response.status == 304 { + tracing::trace!( + "cached request {} is not storable because its response has \ + unsupported status code {:?}", + self.request.uri, + self.response.status, + ); + return false; + } + // "The no-store request directive indicates that a cache MUST NOT + // store any part of either this request or any response to it." + // + // (This is from RFC 9111 S5.2.1.5, and doesn't seem to be mentioned in + // S3.) + if self.request.headers.cc.no_store { + tracing::trace!( + "cached request {} is not storable because its request has \ + a 'no-store' cache-control directive", + self.request.uri, + ); + return false; + } + // "the no-store cache directive is not present in the response" + if self.response.headers.cc.no_store { + tracing::trace!( + "cached request {} is not storable because its response has \ + a 'no-store' cache-control directive", + self.request.uri, + ); + return false; + } + // "if the cache is shared ..." + if self.config.shared { + // "if the cache is shared: the private response directive is either + // not present or allows a shared cache to store a modified response" + // + // We don't support more granular "private" directives (which allow + // caching all of a private HTTP response in a shared cache only after + // removing some subset of the response's headers that are deemed + // private). 
+ if self.response.headers.cc.private { + tracing::trace!( + "cached request {} is not storable because this is a shared \ + cache and its response has a 'private' cache-control directive", + self.request.uri, + ); + return false; + } + // "if the cache is shared: the Authorization header field is not + // present in the request or a response directive is present that + // explicitly allows shared caching" + if self.request.headers.authorization && !self.allows_authorization_storage() { + tracing::trace!( + "cached request {} is not storable because this is a shared \ + cache and the request has an 'Authorization' header set and \ + the response has indicated that caching requests with an \ + 'Authorization' header is allowed", + self.request.uri, + ); + return false; + } + } + + // "the response contains at least one of the following ..." + // + // "a public response directive" + if self.response.headers.cc.public { + tracing::trace!( + "cached request {} is storable because its response has \ + a 'public' cache-control directive", + self.request.uri, + ); + return true; + } + // "a private response directive, if the cache is not shared" + if !self.config.shared && self.response.headers.cc.private { + tracing::trace!( + "cached request {} is storable because this is a shared cache \ + and its response has a 'private' cache-control directive", + self.request.uri, + ); + return true; + } + // "an Expires header field" + if self.response.headers.expires_unix_timestamp.is_some() { + tracing::trace!( + "cached request {} is storable because its response has an \ + 'Expires' header set", + self.request.uri, + ); + return true; + } + // "a max-age response directive" + if self.response.headers.cc.max_age_seconds.is_some() { + tracing::trace!( + "cached request {} is storable because its response has an \ + 'max-age' cache-control directive", + self.request.uri, + ); + return true; + } + // "if the cache is shared: an s-maxage response directive" + if self.config.shared && self.response.headers.cc.s_maxage_seconds.is_some() { + tracing::trace!( + "cached request {} is storable because this is a shared cache \ + and its response has a 's-maxage' cache-control directive", + self.request.uri, + ); + return true; + } + // "a cache extension that allows it to be cached" + // ... we don't support any extensions. + // + // "a status code that is defined as heuristically cacheable" + if HEURISTICALLY_CACHEABLE_STATUS_CODES.contains(&self.response.status) { + tracing::trace!( + "cached request {} is storable because its response has a \ + heuristically cacheable status code {:?}", + self.request.uri, + self.response.status, + ); + return true; + } + tracing::trace!( + "cached response {} is not storable because it does not meet any \ + of the necessary criteria (e.g., it doesn't have an 'Expires' \ + header set or a 'max-age' cache-control directive)", + self.request.uri, + ); + false + } + + /// Returns true when a response is storable even if it has an + /// `Authorization` header, as per [RFC 9111 S3.5]. + /// + /// [RFC 9111 S3.5]: https://www.rfc-editor.org/rfc/rfc9111.html#section-3.5 + fn allows_authorization_storage(&self) -> bool { + self.response.headers.cc.must_revalidate + || self.response.headers.cc.public + || self.response.headers.cc.s_maxage_seconds.is_some() + } + + /// Returns true if the response is considered fresh as per [RFC 9111 + /// S4.2]. If the response is not fresh, then it considered stale and ought + /// to be revalidated with the origin server. 
+ /// + /// [RFC 9111 S4.2]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.2 + fn is_fresh(&self, now: SystemTime, request: &reqwest::Request) -> bool { + let freshness_lifetime = self.freshness_lifetime().as_secs(); + let age = self.age(now).as_secs(); + + // Per RFC 8246, the `immutable` directive means that a reload from an + // end user should not result in a revlalidation request. Indeed, the + // `immutable` directive seems to imply that clients should never talk + // to the origin server until the cached response is stale with respect + // to its freshness lifetime (as set by the server). + // + // A *force* reload from an end user should override this, but we + // currently have no path for that in this implementation. Instead, we + // just interpret `immutable` as meaning that any directives on the + // new request that would otherwise result in sending a revalidation + // request are ignored. + // + // [RFC 8246]: https://httpwg.org/specs/rfc8246.html + if !self.response.headers.cc.immutable { + let reqcc = request + .headers() + .get_all("cache-control") + .iter() + .collect::(); + + // As per [RFC 9111 S5.2.1.4], if the request has `no-cache`, then we should + // respect that. + // + // [RFC 9111 S5.2.1.4]: https://www.rfc-editor.org/rfc/rfc9111.html#section-5.2.1.4 + if reqcc.no_cache { + tracing::trace!( + "request {} does not have a fresh cache because \ + it has a 'no-cache' cache-control directive", + request.url(), + ); + return false; + } + + // If the request has a max-age directive, then we should respect that + // as per [RFC 9111 S5.2.1.1]. + // + // [RFC 9111 S5.2.1.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-5.2.1.1 + if let Some(&max_age) = reqcc.max_age_seconds.as_ref() { + if age > max_age { + tracing::trace!( + "request {} does not have a fresh cache because \ + the cached response's age is {} seconds and the max age \ + allowed by the request is {} seconds", + request.url(), + age, + max_age, + ); + return false; + } + } + + // If the request has a min-fresh directive, then we only consider a + // cached response fresh if the remaining time it has to live exceeds + // the threshold provided, as per [RFC 9111 S5.2.1.3]. + // + // [RFC 9111 S5.2.1.3]: https://www.rfc-editor.org/rfc/rfc9111.html#section-5.2.1.3 + if let Some(&min_fresh) = reqcc.min_fresh_seconds.as_ref() { + let time_to_live = freshness_lifetime.saturating_sub(unix_timestamp(now)); + if time_to_live < min_fresh { + tracing::trace!( + "request {} does not have a fresh cache because \ + the request set a 'min-fresh' cache-control directive, \ + and its time-to-live is {} seconds but it needs to be \ + at least {} seconds", + request.url(), + time_to_live, + min_fresh, + ); + // Note that S5.2.1.3 does not say that max-stale overrides + // this, so we ignore it here. + return false; + } + } + } + if age > freshness_lifetime { + let allows_stale = self.allows_stale(now); + if !allows_stale { + tracing::trace!( + "request {} does not have a fresh cache because \ + its age is {} seconds, it is greater than the freshness \ + lifetime of {} seconds and stale cached responses are not \ + allowed", + request.url(), + age, + freshness_lifetime, + ); + return false; + } + } + true + } + + /// Returns true if we're allowed to serve a stale response, as per [RFC + /// 9111 S4.2.4]. 
+ /// + /// [RFC 9111 S4.2.4]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.2.4 + fn allows_stale(&self, now: SystemTime) -> bool { + // As per [RFC 9111 S5.2.2.2], if `must-revalidate` is present, then + // caches cannot reuse a stale response without talking to the server + // first. Note that RFC 9111 doesn't seem to say anything about the + // interaction between must-revalidate and max-stale, so we assume that + // must-revalidate takes precedent. + // + // [RFC 9111 S5.2.2.2]: https://www.rfc-editor.org/rfc/rfc9111.html#section-5.2.2.2 + if self.response.headers.cc.must_revalidate { + tracing::trace!( + "cached request {} has a cached response that does not \ + permit staleness because the response has a 'must-revalidate' \ + cache-control directive set", + self.request.uri, + ); + return false; + } + if let Some(&max_stale) = self.request.headers.cc.max_stale_seconds.as_ref() { + // As per [RFC 9111 S5.2.1.2], if the client has max-stale set, + // then stale responses are allowed, but only if they are stale + // within a given threshold. + // + // [RFC 9111 S5.2.1.2]: https://www.rfc-editor.org/rfc/rfc9111.html#section-5.2.1.2 + let stale_amount = self + .age(now) + .as_secs() + .saturating_sub(self.freshness_lifetime().as_secs()); + if stale_amount <= max_stale { + tracing::trace!( + "cached request {} has a cached response that allows staleness \ + in this case because the stale amount is {} seconds and the \ + 'max-stale' cache-control directive set by the cached request \ + is {} seconds", + self.request.uri, + stale_amount, + max_stale, + ); + return true; + } + } + // As per [RFC 9111 S4.2.4], we shouldn't use stale responses unless + // we're explicitly allowed to (e.g., via `max-stale` above): + // + // "A cache MUST NOT generate a stale response unless it is + // disconnected or doing so is explicitly permitted by the client or + // origin server..." + // + // [RFC 9111 S4.2.4]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.2.4 + tracing::trace!( + "cached request {} has a cached response that does not allow staleness", + self.request.uri, + ); + false + } + + /// Returns the age of the HTTP response as per [RFC 9111 S4.2.3]. + /// + /// The age of a response, essentially, refers to how long it has been + /// since the response was created by the origin server. The age is used + /// to compare with the freshness lifetime of the response to determine + /// whether the response is fresh or stale. + /// + /// [RFC 9111 S4.2.3]: https://www.rfc-editor.org/rfc/rfc9111.html#name-calculating-age + fn age(&self, now: SystemTime) -> Duration { + // RFC 9111 S4.2.3 + let apparent_age = self + .response + .unix_timestamp + .saturating_sub(self.response.header_date()); + let response_delay = self + .response + .unix_timestamp + .saturating_sub(self.request.unix_timestamp); + let corrected_age_value = self.response.header_age().saturating_add(response_delay); + let corrected_initial_age = apparent_age.max(corrected_age_value); + let resident_age = unix_timestamp(now).saturating_sub(self.response.unix_timestamp); + let current_age = corrected_initial_age + resident_age; + Duration::from_secs(current_age) + } + + /// Returns how long a response should be considered "fresh" as per + /// [RFC 9111 S4.2.1]. When this returns false, the response should be + /// considered stale and the client should revalidate with the server. + /// + /// If there are no indicators of a response's freshness lifetime, then + /// this returns `0`. 
That is, the response will be considered stale in all
+    /// cases.
+    ///
+    /// [RFC 9111 S4.2.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.2.1
+    fn freshness_lifetime(&self) -> Duration {
+        if self.config.shared {
+            if let Some(&s_maxage) = self.response.headers.cc.s_maxage_seconds.as_ref() {
+                return Duration::from_secs(s_maxage);
+            }
+        }
+        if let Some(&max_age) = self.response.headers.cc.max_age_seconds.as_ref() {
+            return Duration::from_secs(max_age);
+        }
+        if let Some(&expires) = self.response.headers.expires_unix_timestamp.as_ref() {
+            return Duration::from_secs(expires.saturating_sub(self.response.header_date()));
+        }
+        if let Some(&last_modified) = self.response.headers.last_modified_unix_timestamp.as_ref() {
+            let interval = self.response.header_date().saturating_sub(last_modified);
+            let percent = u64::from(self.config.heuristic_percent);
+            return Duration::from_secs(interval.saturating_mul(percent).saturating_div(100));
+        }
+        // Without any indicators as to the freshness lifetime, we act
+        // conservatively and use a value that will always result in a response
+        // being treated as stale.
+        Duration::ZERO
+    }
+
+    fn new_cache_policy_builder(&self, request: &reqwest::Request) -> CachePolicyBuilder {
+        let request_headers = request.headers().clone();
+        CachePolicyBuilder {
+            config: self.config.clone(),
+            request: Request::from(request),
+            request_headers,
+        }
+    }
+}
+
+/// The result of calling [`CachePolicy::before_request`].
+///
+/// This dictates what the caller should do next by indicating whether the
+/// cached response is stale or not.
+#[derive(Debug)]
+pub enum BeforeRequest {
+    /// The cached response is still fresh, and the caller may return the
+    /// cached response without issuing an HTTP request.
+    Fresh,
+    /// The cached response is stale. The caller should send a re-validation
+    /// request and then call `CachePolicy::after_response` to determine
+    /// whether the cached response is actually fresh, or if it's stale and
+    /// needs to be updated.
+    Stale(CachePolicyBuilder),
+    /// The given request does not match the cache policy identification.
+    /// Generally speaking, this usually implies a bug with the cache in
+    /// that it loaded a cache policy that does not match the request.
+    NoMatch,
+}
+
+/// The result of calling [`CachePolicy::after_response`].
+///
+/// This is meant to report whether a revalidation request was successful or
+/// not. If it was, then an `AfterResponse::NotModified` is returned. Otherwise,
+/// the server determined the cached response was truly stale and in need of
+/// updating.
+#[derive(Debug)]
+pub enum AfterResponse {
+    /// The cached response is still fresh.
+    NotModified(CachePolicy),
+    /// The cached response has been invalidated and needs to be updated with
+    /// the new data in the response to the revalidation request.
+ Modified(CachePolicy), +} + +#[derive(Debug, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)] +#[archive(check_bytes)] +#[archive_attr(derive(Debug))] +struct Request { + uri: String, + method: Method, + headers: RequestHeaders, + unix_timestamp: u64, +} + +impl<'a> From<&'a reqwest::Request> for Request { + fn from(from: &'a reqwest::Request) -> Request { + Request { + uri: from.url().to_string(), + method: Method::from(from.method()), + headers: RequestHeaders::from(from.headers()), + unix_timestamp: unix_timestamp(SystemTime::now()), + } + } +} + +#[derive(Debug, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)] +#[archive(check_bytes)] +#[archive_attr(derive(Debug))] +struct RequestHeaders { + /// The cache control directives from the `Cache-Control` header. + cc: CacheControl, + /// This is set to `true` only when an `Authorization` header is present. + /// We don't need to record the value. + authorization: bool, +} + +impl<'a> From<&'a http::HeaderMap> for RequestHeaders { + fn from(from: &'a http::HeaderMap) -> RequestHeaders { + RequestHeaders { + cc: from.get_all("cache-control").iter().collect(), + authorization: from.contains_key("authorization"), + } + } +} + +/// The HTTP method used on a request. +/// +/// We don't both representing methods of requests whose responses we won't +/// cache. Instead, we treat them as "unrecognized" and consider the responses +/// not-storable. +#[derive(Debug, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)] +#[archive(check_bytes)] +#[archive_attr(derive(Debug))] +#[repr(u8)] +enum Method { + Get, + Head, + Unrecognized, +} + +impl<'a> From<&'a http::Method> for Method { + fn from(from: &'a http::Method) -> Method { + if from == http::Method::GET { + Method::Get + } else if from == http::Method::HEAD { + Method::Head + } else { + Method::Unrecognized + } + } +} + +#[derive(Debug, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)] +#[archive(check_bytes)] +#[archive_attr(derive(Debug))] +struct Response { + status: u16, + headers: ResponseHeaders, + unix_timestamp: u64, +} + +impl ArchivedResponse { + /// Returns the "age" header value on this response, with a fallback of `0` + /// if the header doesn't exist or is invalid, as per [RFC 9111 S4.2.3]. + /// + /// Note that this does not reflect the true "age" of a response. That + /// is computed via `ArchivedCachePolicy::age` as it may need additional + /// information (such as the request time). + /// + /// [RFC 9111 S4.2.3]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.2.3 + fn header_age(&self) -> u64 { + self.headers.age_seconds.unwrap_or(0) + } + + /// Returns the "date" header value on this response, with a fallback to + /// the time the response was received as per [RFC 9110 S6.6.1]. + /// + /// [RFC 9110 S6.6.1]: https://www.rfc-editor.org/rfc/rfc9110#section-6.6.1 + fn header_date(&self) -> u64 { + self.headers + .date_unix_timestamp + .unwrap_or(self.unix_timestamp) + } + + /// Returns true when this response has a status code that is considered + /// "final" as per [RFC 9110 S15]. 
+    ///
+    /// [RFC 9110 S15]: https://www.rfc-editor.org/rfc/rfc9110#section-15
+    fn has_final_status(&self) -> bool {
+        self.status >= 200
+    }
+}
+
+impl<'a> From<&'a reqwest::Response> for Response {
+    fn from(from: &'a reqwest::Response) -> Response {
+        Response {
+            status: from.status().as_u16(),
+            headers: ResponseHeaders::from(from.headers()),
+            unix_timestamp: unix_timestamp(SystemTime::now()),
+        }
+    }
+}
+
+#[derive(Debug, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)]
+#[archive(check_bytes)]
+#[archive_attr(derive(Debug))]
+struct ResponseHeaders {
+    /// The directives from the `Cache-Control` header.
+    cc: CacheControl,
+    /// The value of the `Age` header corresponding to `age_value` as defined
+    /// in [RFC 9111 S4.2.3]. If the `Age` header is not present, it should be
+    /// interpreted as `0`.
+    ///
+    /// [RFC 9111 S4.2.3]: https://www.rfc-editor.org/rfc/rfc9111.html#name-calculating-age
+    age_seconds: Option<u64>,
+    /// This is `date_value` from [RFC 9111 S4.2.3], which says it corresponds
+    /// to the `Date` header on a response as defined in [RFC 7231 S7.1.1.2].
+    /// In RFC 7231, if the `Date` header is not present, then the recipient
+    /// should treat its value as equivalent to the time the response was
+    /// received. In this case, that would be `Response::unix_timestamp`.
+    ///
+    /// [RFC 9111 S4.2.3]: https://www.rfc-editor.org/rfc/rfc9111.html#name-calculating-age
+    /// [RFC 7231 S7.1.1.2]: https://httpwg.org/specs/rfc7231.html#header.date
+    date_unix_timestamp: Option<u64>,
+    /// This is from the `Expires` header as per [RFC 9111 S5.3]. Note that this
+    /// is overridden by the presence of either the `max-age` or `s-maxage` cache
+    /// control directives.
+    ///
+    /// If an `Expires` header was present but did not contain a valid RFC 2822
+    /// datetime, then this is set to `Some(0)`. (That is, some time in the
+    /// past, which implies the response has already expired.)
+    ///
+    /// [RFC 9111 S5.3]: https://www.rfc-editor.org/rfc/rfc9111.html#section-5.3
+    expires_unix_timestamp: Option<u64>,
+    /// The date from the `Last-Modified` header as specified in [RFC 9110 S8.8.2]
+    /// in RFC 2822 format. It's used to compute a heuristic freshness lifetime for
+    /// the response when other indicators are missing as per [RFC 9111 S4.2.2].
+    ///
+    /// [RFC 9110 S8.8.2]: https://www.rfc-editor.org/rfc/rfc9110#section-8.8.2
+    /// [RFC 9111 S4.2.2]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.2.2
+    last_modified_unix_timestamp: Option<u64>,
+    /// The "entity tag" from the response as per [RFC 9110 S8.8.3], which is
+    /// used in revalidation requests.
+    ///
+    /// [RFC 9110 S8.8.3]: https://www.rfc-editor.org/rfc/rfc9110#section-8.8.3
+    etag: Option<ETag>,
+}
+
+impl<'a> From<&'a http::HeaderMap> for ResponseHeaders {
+    fn from(from: &'a http::HeaderMap) -> ResponseHeaders {
+        ResponseHeaders {
+            cc: from.get_all("cache-control").iter().collect(),
+            age_seconds: from
+                .get("age")
+                .and_then(|header| parse_seconds(header.as_bytes())),
+            date_unix_timestamp: from
+                .get("date")
+                .and_then(|header| header.to_str().ok())
+                .and_then(rfc2822_to_unix_timestamp),
+            expires_unix_timestamp: from
+                .get("expires")
+                .and_then(|header| header.to_str().ok())
+                .and_then(rfc2822_to_unix_timestamp),
+            last_modified_unix_timestamp: from
+                .get("last-modified")
+                .and_then(|header| header.to_str().ok())
+                .and_then(rfc2822_to_unix_timestamp),
+            etag: from
+                .get("etag")
+                .map(|header| ETag::parse(header.as_bytes())),
+        }
+    }
+}
+
+#[derive(Debug, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)]
+#[archive(check_bytes)]
+#[archive_attr(derive(Debug))]
+struct ETag {
+    /// The actual ETag validator value.
+    ///
+    /// This is received in the response, recorded as part of the cache policy
+    /// and then sent back in a re-validation request. This is the "best"
+    /// way for an HTTP server to return an HTTP 304 NOT MODIFIED status,
+    /// indicating that our cached response is still fresh.
+    value: Vec<u8>,
+    /// When `weak` is true, this etag is considered a "weak" validator. In
+    /// effect, it provides weaker semantics than a "strong" validator. As per
+    /// [RFC 9110 S8.8.1]:
+    ///
+    /// "In contrast, a "weak validator" is representation metadata that might
+    /// not change for every change to the representation data. This weakness
+    /// might be due to limitations in how the value is calculated (e.g.,
+    /// clock resolution), an inability to ensure uniqueness for all possible
+    /// representations of the resource, or a desire of the resource owner to
+    /// group representations by some self-determined set of equivalency rather
+    /// than unique sequences of data."
+    ///
+    /// We don't currently support weak validation.
+    ///
+    /// [RFC 9110 S8.8.1]: https://www.rfc-editor.org/rfc/rfc9110#section-8.8.1-6
+    weak: bool,
+}
+
+impl ETag {
+    /// Parses an ETag from a header value.
+    ///
+    /// We are a little permissive here and allow arbitrary bytes,
+    /// whereas [RFC 9110 S8.8.3] is a bit more restrictive.
+    ///
+    /// [RFC 9110 S8.8.3]: https://www.rfc-editor.org/rfc/rfc9110#section-8.8.3
+    fn parse(header_value: &[u8]) -> ETag {
+        let (value, weak) = if header_value.starts_with(b"W/") {
+            (&header_value[2..], true)
+        } else {
+            (header_value, false)
+        };
+        ETag {
+            value: value.to_vec(),
+            weak,
+        }
+    }
+}
+
+/// Represents the `Vary` header on a cached response, as per [RFC 9110
+/// S12.5.5] and [RFC 9111 S4.1].
+///
+/// This permits responses from the server to express things like, "only use
+/// an existing cached response if the request from the client has the same
+/// header values for the headers listed in `Vary` as in the original request."
+///
+/// [RFC 9110 S12.5.5]: https://www.rfc-editor.org/rfc/rfc9110#section-12.5.5
+/// [RFC 9111 S4.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.1
+#[derive(Debug, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)]
+#[archive(check_bytes)]
+#[archive_attr(derive(Debug))]
+struct Vary {
+    fields: Vec<VaryField>,
+}
+
+impl Vary {
+    /// Returns a `Vary` header value that will never match any request.
+ fn always_fails_to_match() -> Vary { + Vary { + fields: vec![VaryField { + name: "*".to_string(), + value: vec![], + }], + } + } + + fn from_request_response_headers( + request: &http::HeaderMap, + response: &http::HeaderMap, + ) -> Vary { + // Parses the `Vary` header as per [RFC 9110 S12.5.5]. + // + // [RFC 9110 S12.5.5]: https://www.rfc-editor.org/rfc/rfc9110#section-12.5.5 + let mut fields = vec![]; + for header in response.get_all("vary") { + let Ok(csv) = header.to_str() else { continue }; + for header_name in csv.split(',') { + let header_name = header_name.trim().to_ascii_lowercase(); + // When we see a `*`, that means a failed match is an + // inevitability, regardless of anything else. So just give up + // and return a `Vary` that will never match. + if header_name == "*" { + return Vary::always_fails_to_match(); + } + let value = request + .get(&header_name) + .map(|header| header.as_bytes().to_vec()) + .unwrap_or_default(); + fields.push(VaryField { + name: header_name, + value, + }); + } + } + Vary { fields } + } +} + +impl ArchivedVary { + /// Returns true only when the `Vary` header on a cached response satisfies + /// the request header values given, as per [RFC 9111 S4.1]. + /// + /// [RFC 9111 S4.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.1 + fn matches(&self, request_headers: &http::HeaderMap) -> bool { + for field in self.fields.iter() { + // A `*` anywhere means the match always fails. + if field.name == "*" { + return false; + } + let request_header_value = request_headers + .get(field.name.as_str()) + .map_or(&b""[..], |header| header.as_bytes()); + if field.value.as_slice() != request_header_value { + return false; + } + } + true + } +} + +/// A single field and value in a `Vary` header set by the response, +/// as per [RFC 9111 S4.1]. +/// +/// The `name` of the field comes from the `Vary` header in the response, +/// while the value of the field comes from the value of the header with the +/// same `name` in the original request. These field and value pairs are then +/// compared with new incoming requests. If there is a mismatch, then the +/// cached response cannot be used. 
+/// +/// [RFC 9111 S4.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.1 +#[derive(Debug, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)] +#[archive(check_bytes)] +#[archive_attr(derive(Debug))] +struct VaryField { + name: String, + value: Vec<u8>, +} + +fn unix_timestamp(time: SystemTime) -> u64 { + time.duration_since(SystemTime::UNIX_EPOCH) + .expect("UNIX_EPOCH is as early as it gets") + .as_secs() +} + +fn rfc2822_to_unix_timestamp(s: &str) -> Option<u64> { + rfc2822_to_datetime(s).and_then(|dt| u64::try_from(dt.timestamp()).ok()) +} + +fn rfc2822_to_datetime(s: &str) -> Option<chrono::DateTime<chrono::Utc>> { + chrono::DateTime::parse_from_rfc2822(s) + .ok() + .map(|dt| dt.to_utc()) +} + +fn unix_timestamp_to_header(seconds: u64) -> Option<HeaderValue> { + unix_timestamp_to_rfc2822(seconds).and_then(|string| HeaderValue::from_str(&string).ok()) +} + +fn unix_timestamp_to_rfc2822(seconds: u64) -> Option<String> { + unix_timestamp_to_datetime(seconds).map(|dt| dt.to_rfc2822()) +} + +fn unix_timestamp_to_datetime(seconds: u64) -> Option<chrono::DateTime<chrono::Utc>> { + chrono::DateTime::from_timestamp(i64::try_from(seconds).ok()?, 0) +} + +fn parse_seconds(value: &[u8]) -> Option<u64> { + if !value.iter().all(u8::is_ascii_digit) { + return None; + } + std::str::from_utf8(value).ok()?.parse().ok() +} diff --git a/crates/puffin-client/src/lib.rs b/crates/puffin-client/src/lib.rs index ebf20d587..ecc4917d9 100644 --- a/crates/puffin-client/src/lib.rs +++ b/crates/puffin-client/src/lib.rs @@ -7,11 +7,11 @@ pub use registry_client::{ }; pub use rkyvutil::OwnedArchive; -mod cache_headers; mod cached_client; mod error; mod flat_index; mod html; +mod httpcache; mod registry_client; mod remote_metadata; mod rkyvutil; diff --git a/crates/puffin-client/src/registry_client.rs b/crates/puffin-client/src/registry_client.rs index f440b4512..5fdc909b1 100644 --- a/crates/puffin-client/src/registry_client.rs +++ b/crates/puffin-client/src/registry_client.rs @@ -27,6 +27,7 @@ use pypi_types::{Metadata21, SimpleJson}; use crate::cached_client::CacheControl; use crate::html::SimpleHtml; use crate::remote_metadata::wheel_metadata_from_remote_zip; +use crate::rkyvutil::OwnedArchive; use crate::{CachedClient, CachedClientError, Error, ErrorKind}; /// A builder for an [`RegistryClient`]. @@ -122,7 +123,7 @@ impl RegistryClient { pub async fn simple( &self, package_name: &PackageName, - ) -> Result<(IndexUrl, SimpleMetadata), Error> { + ) -> Result<(IndexUrl, OwnedArchive<SimpleMetadata>), Error> { if self.index_urls.no_index() { return Err(ErrorKind::NoIndex(package_name.as_ref().to_string()).into()); } @@ -152,7 +153,7 @@ impl RegistryClient { &self, package_name: &PackageName, index: &IndexUrl, - ) -> Result<Result<SimpleMetadata, CachedClientError<Error>>, Error> { + ) -> Result<Result<OwnedArchive<SimpleMetadata>, CachedClientError<Error>>, Error> { // Format the URL for PyPI. 
let mut url: Url = index.clone().into(); url.path_segments_mut() @@ -168,7 +169,7 @@ impl RegistryClient { IndexUrl::Pypi => "pypi".to_string(), IndexUrl::Url(url) => cache_key::digest(&cache_key::CanonicalUrl::new(url)), }), - format!("{package_name}.msgpack"), + format!("{package_name}.rkyv"), ); let cache_control = CacheControl::from( self.cache @@ -201,14 +202,14 @@ impl RegistryClient { )) })?; - match media_type { + let unarchived = match media_type { MediaType::Json => { let bytes = response.bytes().await.map_err(ErrorKind::RequestError)?; let data: SimpleJson = serde_json::from_slice(bytes.as_ref()) .map_err(|err| Error::from_json_err(err, url.clone()))?; let metadata = SimpleMetadata::from_files(data.files, package_name, url.as_str()); - Ok(metadata) + metadata } MediaType::Html => { let text = response.text().await.map_err(ErrorKind::RequestError)?; @@ -216,16 +217,17 @@ impl RegistryClient { .map_err(|err| Error::from_html_err(err, url.clone()))?; let metadata = SimpleMetadata::from_files(files, package_name, base.as_url().as_str()); - Ok(metadata) + metadata } - } + }; + OwnedArchive::from_unarchived(&unarchived) } .boxed() .instrument(info_span!("parse_simple_api", package = %package_name)) }; let result = self .client - .get_cached_with_callback( + .get_cacheable( simple_request, &cache_entry, cache_control, @@ -339,7 +341,7 @@ impl RegistryClient { .map_err(ErrorKind::RequestError)?; Ok(self .client - .get_cached_with_callback(req, &cache_entry, cache_control, response_callback) + .get_serde(req, &cache_entry, cache_control, response_callback) .await?) } else { // If we lack PEP 658 support, try using HTTP range requests to read only the @@ -399,7 +401,7 @@ impl RegistryClient { .map_err(ErrorKind::RequestError)?; let result = self .client - .get_cached_with_callback( + .get_serde( req, &cache_entry, cache_control, diff --git a/crates/puffin-dev/src/resolve_many.rs b/crates/puffin-dev/src/resolve_many.rs index 1c37b2b5b..3f510a8fa 100644 --- a/crates/puffin-dev/src/resolve_many.rs +++ b/crates/puffin-dev/src/resolve_many.rs @@ -15,7 +15,7 @@ use pep440_rs::{Version, VersionSpecifier, VersionSpecifiers}; use pep508_rs::{Requirement, VersionOrUrl}; use platform_host::Platform; use puffin_cache::{Cache, CacheArgs}; -use puffin_client::{FlatIndex, RegistryClient, RegistryClientBuilder}; +use puffin_client::{FlatIndex, OwnedArchive, RegistryClient, RegistryClientBuilder}; use puffin_dispatch::BuildDispatch; use puffin_installer::NoBinary; use puffin_interpreter::Virtualenv; @@ -48,7 +48,8 @@ async fn find_latest_version( client: &RegistryClient, package_name: &PackageName, ) -> Option<Version> { - let (_, simple_metadata) = client.simple(package_name).await.ok()?; + let (_, raw_simple_metadata) = client.simple(package_name).await.ok()?; + let simple_metadata = OwnedArchive::deserialize(&raw_simple_metadata); let version = simple_metadata.into_iter().next()?.version; Some(version.clone()) } diff --git a/crates/puffin-distribution/src/distribution_database.rs b/crates/puffin-distribution/src/distribution_database.rs index 883d4807c..cf781667b 100644 --- a/crates/puffin-distribution/src/distribution_database.rs +++ b/crates/puffin-distribution/src/distribution_database.rs @@ -177,7 +177,7 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> let archive = self .client .cached_client() - .get_cached_with_callback(req, &http_entry, cache_control, download) + .get_serde(req, &http_entry, cache_control, download) .await .map_err(|err| match err { 
CachedClientError::Callback(err) => err, @@ -240,7 +240,7 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> let archive = self .client .cached_client() - .get_cached_with_callback(req, &http_entry, cache_control, download) + .get_serde(req, &http_entry, cache_control, download) .await .map_err(|err| match err { CachedClientError::Callback(err) => err, diff --git a/crates/puffin-distribution/src/source/mod.rs b/crates/puffin-distribution/src/source/mod.rs index d1ad59103..7e3464ae9 100644 --- a/crates/puffin-distribution/src/source/mod.rs +++ b/crates/puffin-distribution/src/source/mod.rs @@ -273,7 +273,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { let req = self.cached_client.uncached().get(url.clone()).build()?; let manifest = self .cached_client - .get_cached_with_callback(req, &cache_entry, cache_control, download) + .get_serde(req, &cache_entry, cache_control, download) .await .map_err(|err| match err { CachedClientError::Callback(err) => err, @@ -366,7 +366,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { let req = self.cached_client.uncached().get(url.clone()).build()?; let manifest = self .cached_client - .get_cached_with_callback(req, &cache_entry, cache_control, download) + .get_serde(req, &cache_entry, cache_control, download) .await .map_err(|err| match err { CachedClientError::Callback(err) => err, @@ -941,10 +941,11 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { /// Read an existing HTTP-cached [`Manifest`], if it exists. pub(crate) fn read_http_manifest(cache_entry: &CacheEntry) -> Result<Option<Manifest>, Error> { - match std::fs::read(cache_entry.path()) { - Ok(cached) => Ok(Some( - rmp_serde::from_slice::<DataWithCachePolicy<Manifest>>(&cached)?.data, - )), + match std::fs::File::open(cache_entry.path()) { + Ok(file) => { + let data = DataWithCachePolicy::from_reader(file)?.data; + Ok(Some(rmp_serde::from_slice::<Manifest>(&data)?)) + } Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(None), Err(err) => Err(Error::CacheRead(err)), } diff --git a/crates/puffin-resolver/Cargo.toml b/crates/puffin-resolver/Cargo.toml index a716a75ea..4f87cabfb 100644 --- a/crates/puffin-resolver/Cargo.toml +++ b/crates/puffin-resolver/Cargo.toml @@ -40,7 +40,6 @@ dashmap = { workspace = true } derivative = { workspace = true } fs-err = { workspace = true, features = ["tokio"] } futures = { workspace = true } -http-cache-semantics = { workspace = true } indexmap = { workspace = true } itertools = { workspace = true } once_cell = { workspace = true } diff --git a/crates/puffin-resolver/src/finder.rs b/crates/puffin-resolver/src/finder.rs index 33412cdfb..93bb8ac77 100644 --- a/crates/puffin-resolver/src/finder.rs +++ b/crates/puffin-resolver/src/finder.rs @@ -12,7 +12,7 @@ use distribution_types::{Dist, IndexUrl, Resolution}; use pep508_rs::{Requirement, VersionOrUrl}; use platform_tags::Tags; use puffin_client::{ - FlatDistributions, FlatIndex, RegistryClient, SimpleMetadata, SimpleMetadatum, + FlatDistributions, FlatIndex, OwnedArchive, RegistryClient, SimpleMetadata, SimpleMetadatum, }; use puffin_interpreter::Interpreter; use puffin_normalize::PackageName; @@ -67,7 +67,8 @@ impl<'a> DistFinder<'a> { match requirement.version_or_url.as_ref() { None | Some(VersionOrUrl::VersionSpecifier(_)) => { // Query the index(es) (cached) to get the URLs for the available files. 
- let (index, metadata) = self.client.simple(&requirement.name).await?; + let (index, raw_metadata) = self.client.simple(&requirement.name).await?; + let metadata = OwnedArchive::deserialize(&raw_metadata); // Pick a version that satisfies the requirement. let Some(dist) = self.select(requirement, metadata, &index, flat_index) else { diff --git a/crates/puffin-resolver/src/version_map.rs b/crates/puffin-resolver/src/version_map.rs index 2838deffa..8d32f0048 100644 --- a/crates/puffin-resolver/src/version_map.rs +++ b/crates/puffin-resolver/src/version_map.rs @@ -8,7 +8,7 @@ use distribution_filename::DistFilename; use distribution_types::{Dist, IndexUrl, PrioritizedDistribution, ResolvableDist}; use pep440_rs::Version; use platform_tags::Tags; -use puffin_client::{FlatDistributions, SimpleMetadata, SimpleMetadatum}; +use puffin_client::{FlatDistributions, OwnedArchive, SimpleMetadata, SimpleMetadatum}; use puffin_normalize::PackageName; use puffin_traits::NoBinary; use puffin_warnings::warn_user_once; @@ -26,7 +26,7 @@ impl VersionMap { #[instrument(skip_all, fields(package_name))] #[allow(clippy::too_many_arguments)] pub(crate) fn from_metadata( - metadata: SimpleMetadata, + raw_metadata: OwnedArchive<SimpleMetadata>, package_name: &PackageName, index: &IndexUrl, tags: &Tags, @@ -36,6 +36,15 @@ impl VersionMap { flat_index: Option<FlatDistributions>, no_binary: &NoBinary, ) -> Self { + // NOTE: We should experiment with refactoring the code + // below to work on rkyv::Archived<SimpleMetadata>. More + // specifically, we may want to adjust VersionMap itself to + // contain an Archived<SimpleMetadata> of some kind, that in + // turn is used in the resolver. The idea here is to avoid + // eagerly deserializing all of the metadata for a package + // up-front. + let metadata = OwnedArchive::deserialize(&raw_metadata); + // If we have packages of the same name from find links, gives them priority, otherwise start empty let mut version_map: BTreeMap<Version, PrioritizedDistribution> = flat_index.map(Into::into).unwrap_or_default();
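
To make the new calling convention concrete, here is roughly what a consumer of `RegistryClient::simple` looks like after this change. This is a sketch distilled from the call sites above (`resolve_many.rs`, `finder.rs`, `version_map.rs`), not code added by this PR; the `latest` binding is just for illustration, and error handling is elided:

```rust
// Sketch only: callers now get an `OwnedArchive<SimpleMetadata>` back and,
// for the moment, eagerly deserialize it into an owned `SimpleMetadata`
// before use. That deserialization is the non-zero-copy step the NOTE above
// hopes to eventually remove.
let (index, raw_metadata) = client.simple(&package_name).await?;
let metadata = OwnedArchive::deserialize(&raw_metadata);
// `SimpleMetadata` iterates over `SimpleMetadatum` values, so e.g. the
// latest version can be read off the first entry, as in `find_latest_version`.
let latest = metadata.into_iter().next().map(|datum| datum.version);
```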