diff --git a/Cargo.lock b/Cargo.lock index 00452ae51..d1a32d60d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1361,34 +1361,12 @@ dependencies = [ "pin-project-lite", ] -[[package]] -name = "http-cache-semantics" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7aec9f678bca3f4a15194b980f20ed9bfe0dd38e8d298c65c559a93dfbd6380a" -dependencies = [ - "http", - "http-serde", - "serde", - "time", -] - [[package]] name = "http-content-range" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f0d1a8ef218a86416107794b34cc446958d9203556c312bb41eab4c924c1d2e" -[[package]] -name = "http-serde" -version = "1.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f560b665ad9f1572cfcaf034f7fb84338a7ce945216d64a90fd81f046a3caee" -dependencies = [ - "http", - "serde", -] - [[package]] name = "httparse" version = "1.8.0" @@ -2584,7 +2562,6 @@ dependencies = [ "futures", "html-escape", "http", - "http-cache-semantics", "insta", "install-wheel-rs", "pep440_rs", @@ -2868,7 +2845,6 @@ dependencies = [ "fs-err", "futures", "gourgeist", - "http-cache-semantics", "indexmap 2.2.2", "insta", "install-wheel-rs", diff --git a/Cargo.toml b/Cargo.toml index 33f4b8b02..7c5538f25 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -50,7 +50,6 @@ hmac = { version = "0.12.1" } home = { version = "0.5.9" } html-escape = { version = "0.2.13" } http = { version = "0.2.11" } -http-cache-semantics = { version = "1.0.2" } indexmap = { version = "2.1.0" } indicatif = { version = "0.17.7" } indoc = { version = "2.0.4" } diff --git a/crates/puffin-cache/src/lib.rs b/crates/puffin-cache/src/lib.rs index d453c3858..65f9837a3 100644 --- a/crates/puffin-cache/src/lib.rs +++ b/crates/puffin-cache/src/lib.rs @@ -402,7 +402,7 @@ pub enum CacheBucket { /// The following requirements: /// ```text /// # git source dist - /// pydantic-extra-types @ git+https://github.com/pydantic/pydantic-extra-types.git + /// pydantic-extra-types @ git+https://github.com/pydantic/pydantic-extra-types.git /// # pypi source dist /// django_allauth==0.51.0 /// # url source dist @@ -488,8 +488,8 @@ pub enum CacheBucket { /// Index responses through the simple metadata API. /// /// Cache structure: - /// * `simple-v0/pypi/.msgpack` - /// * `simple-v0//.msgpack` + /// * `simple-v0/pypi/.rkyv` + /// * `simple-v0//.rkyv` /// /// The response is parsed into `puffin_client::SimpleMetadata` before storage. Simple, @@ -575,15 +575,15 @@ impl CacheBucket { } } CacheBucket::Simple => { - // For `pypi` wheels, we expect a MsgPack file per package, indexed by name. + // For `pypi` wheels, we expect a rkyv file per package, indexed by name. let root = cache.bucket(self).join(WheelCacheKind::Pypi); - summary += rm_rf(root.join(format!("{name}.msgpack")))?; + summary += rm_rf(root.join(format!("{name}.rkyv")))?; // For alternate indices, we expect a directory for every index, followed by a // MsgPack file per package, indexed by name. 
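As a quick illustration of the `simple-v0` layout described in the comments above, a minimal standalone sketch of how the per-package `.rkyv` paths are shaped. The helper names and cache root are hypothetical; the real path construction goes through `CacheBucket` and `WheelCacheKind`.

```rust
use std::path::{Path, PathBuf};

// Hypothetical helpers mirroring the layout above: a `.rkyv` file per
// package under `simple-v0/pypi/` for PyPI, and under a per-index
// directory for alternate indexes.
fn pypi_simple_entry(cache_root: &Path, package: &str) -> PathBuf {
    cache_root.join("simple-v0").join("pypi").join(format!("{package}.rkyv"))
}

fn index_simple_entry(cache_root: &Path, index: &str, package: &str) -> PathBuf {
    cache_root.join("simple-v0").join(index).join(format!("{package}.rkyv"))
}

fn main() {
    let root = Path::new(".cache/puffin");
    println!("{}", pypi_simple_entry(root, "django-allauth").display());
    println!("{}", index_simple_entry(root, "internal-index", "django-allauth").display());
}
```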
let root = cache.bucket(self).join(WheelCacheKind::Url); for directory in directories(root) { - summary += rm_rf(directory.join(format!("{name}.msgpack")))?; + summary += rm_rf(directory.join(format!("{name}.rkyv")))?; } } CacheBucket::FlatIndex => { diff --git a/crates/puffin-client/Cargo.toml b/crates/puffin-client/Cargo.toml index 1b5b3e686..b26015ceb 100644 --- a/crates/puffin-client/Cargo.toml +++ b/crates/puffin-client/Cargo.toml @@ -23,7 +23,6 @@ fs-err = { workspace = true, features = ["tokio"] } futures = { workspace = true } html-escape = { workspace = true } http = { workspace = true } -http-cache-semantics = { workspace = true } reqwest = { workspace = true } reqwest-middleware = { workspace = true } reqwest-retry = { workspace = true } diff --git a/crates/puffin-client/src/cache_headers.rs b/crates/puffin-client/src/cache_headers.rs deleted file mode 100644 index 3728a6ed6..000000000 --- a/crates/puffin-client/src/cache_headers.rs +++ /dev/null @@ -1,56 +0,0 @@ -use http::HeaderValue; -use rustc_hash::FxHashMap; -use std::collections::hash_map::Entry; - -/// Cache headers from an HTTP response. -#[derive(Debug, Default)] -pub(crate) struct CacheHeaders(FxHashMap, Option>>); - -impl CacheHeaders { - /// Parse the `Cache-Control` header from an HTTP response. - /// - /// See: - pub(crate) fn from_response<'a>( - headers: impl IntoIterator, - ) -> CacheHeaders { - let mut cc = FxHashMap::, Option>>::default(); - let mut is_valid = true; - - for h in headers.into_iter().filter_map(|v| v.to_str().ok()) { - for part in h.split(',') { - // TODO: lame parsing - if part.trim().is_empty() { - continue; - } - let mut kv = part.splitn(2, '='); - let k = kv.next().unwrap().trim(); - if k.is_empty() { - continue; - } - let v = kv.next().map(str::trim); - match cc.entry(k.into()) { - Entry::Occupied(e) => { - // When there is more than one value present for a given directive (e.g., two Expires header fields, multiple Cache-Control: max-age directives), - // the directive's value is considered invalid. Caches are encouraged to consider responses that have invalid freshness information to be stale - if e.get().as_deref() != v { - is_valid = false; - } - } - Entry::Vacant(e) => { - e.insert(v.map(|v| v.trim_matches('"')).map(From::from)); - // TODO: bad unquoting - } - } - } - } - if !is_valid { - cc.insert("must-revalidate".into(), None); - } - Self(cc) - } - - /// Returns `true` if the response has an `immutable` directive. 
- pub(crate) fn is_immutable(&self) -> bool { - self.0.contains_key("immutable") - } -} diff --git a/crates/puffin-client/src/cached_client.rs b/crates/puffin-client/src/cached_client.rs index 4c82ae31b..b4420ac82 100644 --- a/crates/puffin-client/src/cached_client.rs +++ b/crates/puffin-client/src/cached_client.rs @@ -1,20 +1,98 @@ -use std::future::Future; -use std::time::SystemTime; +use std::{borrow::Cow, future::Future, path::Path}; use futures::FutureExt; -use http::request::Parts; -use http_cache_semantics::{AfterResponse, BeforeRequest, CachePolicy}; use reqwest::{Request, Response}; use reqwest_middleware::ClientWithMiddleware; +use rkyv::util::AlignedVec; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use tracing::{debug, info_span, instrument, trace, warn, Instrument}; -use url::Url; use puffin_cache::{CacheEntry, Freshness}; use puffin_fs::write_atomic; -use crate::{cache_headers::CacheHeaders, Error, ErrorKind}; +use crate::{ + httpcache::{AfterResponse, BeforeRequest, CachePolicy, CachePolicyBuilder}, + rkyvutil::OwnedArchive, + Error, ErrorKind, +}; + +/// A trait the generalizes (de)serialization at a high level. +/// +/// The main purpose of this trait is to make the `CachedClient` work for +/// either serde or other mechanisms of serialization such as `rkyv`. +/// +/// If you're using Serde, then unless you want to control the format, callers +/// should just use `CachedClient::get_serde`. This will use a default +/// implementation of `Cacheable` internally. +/// +/// Alternatively, callers using `rkyv` should use +/// `CachedClient::get_cacheable`. If your types fit into the +/// `rkyvutil::OwnedArchive` mold, then an implementation of `Cacheable` is +/// already provided for that type. +pub trait Cacheable: Sized + Send { + /// This associated type permits customizing what the "output" type of + /// deserialization is. It can be identical to `Self`. + /// + /// Typical use of this is for wrapper types used to proviate blanket trait + /// impls without hitting overlapping impl problems. + type Target; + + /// Deserialize a value from bytes aligned to a 16-byte boundary. + fn from_aligned_bytes(bytes: AlignedVec) -> Result; + /// Serialize bytes to a possibly owned byte buffer. + fn to_bytes(&self) -> Result, crate::Error>; + /// Convert this type into its final form. + fn into_target(self) -> Self::Target; +} + +/// A wrapper type that makes anything with Serde support automatically +/// implement `Cacheable`. +#[derive(Debug, Deserialize, Serialize)] +#[serde(transparent)] +pub struct SerdeCacheable { + inner: T, +} + +impl Cacheable for SerdeCacheable { + type Target = T; + + fn from_aligned_bytes(bytes: AlignedVec) -> Result { + Ok(rmp_serde::from_slice::(&bytes).map_err(ErrorKind::Decode)?) + } + + fn to_bytes(&self) -> Result, Error> { + Ok(Cow::from( + rmp_serde::to_vec(&self.inner).map_err(ErrorKind::Encode)?, + )) + } + + fn into_target(self) -> Self::Target { + self.inner + } +} + +/// All `OwnedArchive` values are cacheable. 
+impl Cacheable for OwnedArchive +where + A: rkyv::Archive + rkyv::Serialize> + Send, + A::Archived: for<'a> rkyv::CheckBytes> + + rkyv::Deserialize, +{ + type Target = OwnedArchive; + + fn from_aligned_bytes(bytes: AlignedVec) -> Result, Error> { + OwnedArchive::new(bytes) + } + + fn to_bytes(&self) -> Result, Error> { + Ok(Cow::from(OwnedArchive::as_bytes(self))) + } + + fn into_target(self) -> Self::Target { + self + } +} /// Either a cached client error or a (user specified) error from the callback #[derive(Debug)] @@ -44,285 +122,6 @@ impl> From> for Error { } } -#[derive(Debug)] -enum CachedResponse { - /// The cached response is fresh without an HTTP request (e.g. immutable) - FreshCache(Payload), - /// The cached response is fresh after an HTTP request (e.g. 304 not modified) - NotModified(DataWithCachePolicy), - /// There was no prior cached response or the cache was outdated - /// - /// The cache policy is `None` if it isn't storable - ModifiedOrNew(Response, Option>), -} - -/// Serialize the actual payload together with its caching information. -#[derive(Debug, Deserialize, Serialize)] -pub struct DataWithCachePolicy { - pub data: Payload, - /// Whether the response should be considered immutable. - immutable: bool, - /// The [`CachePolicy`] is used to determine if the response is fresh or stale. - /// The policy is large (448 bytes at time of writing), so we reduce the stack size by - /// boxing it. - cache_policy: Box, -} - -/// Custom caching layer over [`reqwest::Client`] using `http-cache-semantics`. -/// -/// The implementation takes inspiration from the `http-cache` crate, but adds support for running -/// an async callback on the response before caching. We use this to e.g. store a -/// parsed version of the wheel metadata and for our remote zip reader. In the latter case, we want -/// to read a single file from a remote zip using range requests (so we don't have to download the -/// entire file). We send a HEAD request in the caching layer to check if the remote file has -/// changed (and if range requests are supported), and in the callback we make the actual range -/// requests if required. -/// -/// Unlike `http-cache`, all outputs must be serde-able. Currently everything is json, but we can -/// transparently switch to a faster/smaller format. -/// -/// Again unlike `http-cache`, the caller gets full control over the cache key with the assumption -/// that it's a file. -#[derive(Debug, Clone)] -pub struct CachedClient(ClientWithMiddleware); - -impl CachedClient { - pub fn new(client: ClientWithMiddleware) -> Self { - Self(client) - } - - /// The middleware is the retry strategy - pub fn uncached(&self) -> ClientWithMiddleware { - self.0.clone() - } - - /// Make a cached request with a custom response transformation - /// - /// If a new response was received (no prior cached response or modified on the remote), the - /// response is passed through `response_callback` and only the result is cached and returned. - /// The `response_callback` is allowed to make subsequent requests, e.g. through the uncached - /// client. 
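The `Cacheable` trait above is what lets the cached client treat serde-backed payloads and rkyv `OwnedArchive`s uniformly. Below is a standalone sketch of the same pattern, simplified to plain `Vec<u8>` and MessagePack via `rmp_serde` so it compiles on its own; the real trait works over rkyv's `AlignedVec` and the crate's own `Error` type.

```rust
use serde::{de::DeserializeOwned, Deserialize, Serialize};

// A simplified stand-in for the trait above.
trait Cacheable: Sized {
    type Target;
    fn from_bytes(bytes: &[u8]) -> Result<Self, rmp_serde::decode::Error>;
    fn to_bytes(&self) -> Result<Vec<u8>, rmp_serde::encode::Error>;
    fn into_target(self) -> Self::Target;
}

// Mirrors `SerdeCacheable`: anything with serde support becomes cacheable by
// round-tripping through MessagePack.
#[derive(Debug, Deserialize, Serialize)]
#[serde(transparent)]
struct SerdeCacheable<T> {
    inner: T,
}

impl<T: Serialize + DeserializeOwned> Cacheable for SerdeCacheable<T> {
    type Target = T;

    fn from_bytes(bytes: &[u8]) -> Result<Self, rmp_serde::decode::Error> {
        rmp_serde::from_slice(bytes)
    }

    fn to_bytes(&self) -> Result<Vec<u8>, rmp_serde::encode::Error> {
        rmp_serde::to_vec(&self.inner)
    }

    fn into_target(self) -> Self::Target {
        self.inner
    }
}

#[derive(Debug, Deserialize, Serialize)]
struct Metadata {
    name: String,
    versions: Vec<String>,
}

fn main() {
    let payload = SerdeCacheable {
        inner: Metadata {
            name: "example".to_string(),
            versions: vec!["1.0.0".to_string()],
        },
    };
    let bytes = payload.to_bytes().unwrap();
    let cached = SerdeCacheable::<Metadata>::from_bytes(&bytes).unwrap();
    assert_eq!(cached.into_target().name, "example");
}
```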
- #[instrument(skip_all)] - pub async fn get_cached_with_callback< - Payload: Serialize + DeserializeOwned + Send + 'static, - CallBackError, - Callback, - CallbackReturn, - >( - &self, - req: Request, - cache_entry: &CacheEntry, - cache_control: CacheControl, - response_callback: Callback, - ) -> Result> - where - Callback: FnOnce(Response) -> CallbackReturn, - CallbackReturn: Future> + Send, - { - let cached = Self::read_cache(cache_entry).await; - - let cached_response = self.send_cached(req, cache_control, cached).boxed().await?; - - let write_cache = info_span!("write_cache", file = %cache_entry.path().display()); - match cached_response { - CachedResponse::FreshCache(data) => Ok(data), - CachedResponse::NotModified(data_with_cache_policy) => { - async { - let data = - rmp_serde::to_vec(&data_with_cache_policy).map_err(ErrorKind::Encode)?; - write_atomic(cache_entry.path(), data) - .await - .map_err(ErrorKind::CacheWrite)?; - Ok(data_with_cache_policy.data) - } - .instrument(write_cache) - .await - } - CachedResponse::ModifiedOrNew(res, cache_policy) => { - let headers = CacheHeaders::from_response(res.headers().get_all("cache-control")); - let immutable = headers.is_immutable(); - - let data = response_callback(res) - .boxed() - .await - .map_err(|err| CachedClientError::Callback(err))?; - if let Some(cache_policy) = cache_policy { - let data_with_cache_policy = DataWithCachePolicy { - data, - immutable, - cache_policy, - }; - async { - fs_err::tokio::create_dir_all(cache_entry.dir()) - .await - .map_err(ErrorKind::CacheWrite)?; - let data = rmp_serde::to_vec(&data_with_cache_policy) - .map_err(ErrorKind::Encode)?; - write_atomic(cache_entry.path(), data) - .await - .map_err(ErrorKind::CacheWrite)?; - Ok(data_with_cache_policy.data) - } - .instrument(write_cache) - .await - } else { - Ok(data) - } - } - } - } - - async fn read_cache( - cache_entry: &CacheEntry, - ) -> Option> { - let read_span = info_span!("read_cache", file = %cache_entry.path().display()); - let read_result = fs_err::tokio::read(cache_entry.path()) - .instrument(read_span) - .await; - - if let Ok(cached) = read_result { - let parse_span = info_span!( - "parse_cache", - path = %cache_entry.path().display() - ); - let parse_result = tokio::task::spawn_blocking(move || { - parse_span - .in_scope(|| rmp_serde::from_slice::>(&cached)) - }) - .await - .expect("Tokio executor failed, was there a panic?"); - match parse_result { - Ok(data) => Some(data), - Err(err) => { - warn!( - "Broken cache entry at {}, removing: {err}", - cache_entry.path().display() - ); - let _ = fs_err::tokio::remove_file(&cache_entry.path()).await; - None - } - } - } else { - None - } - } - - /// `http-cache-semantics` to `reqwest` wrapper - async fn send_cached( - &self, - mut req: Request, - cache_control: CacheControl, - cached: Option>, - ) -> Result, Error> { - let url = req.url().clone(); - let cached_response = if let Some(cached) = cached { - // Avoid sending revalidation requests for immutable responses. - if cached.immutable && !cached.cache_policy.is_stale(SystemTime::now()) { - debug!("Found immutable response for: {url}"); - return Ok(CachedResponse::FreshCache(cached.data)); - } - - // Apply the cache control header, if necessary. 
- match cache_control { - CacheControl::None => {} - CacheControl::MustRevalidate => { - req.headers_mut().insert( - http::header::CACHE_CONTROL, - http::HeaderValue::from_static("max-age=0, must-revalidate"), - ); - } - } - - match cached - .cache_policy - .before_request(&RequestLikeReqwest(&req), SystemTime::now()) - { - BeforeRequest::Fresh(_) => { - debug!("Found fresh response for: {url}"); - CachedResponse::FreshCache(cached.data) - } - BeforeRequest::Stale { request, matches } => { - self.send_cached_handle_stale(req, url, cached, &request, matches) - .await? - } - } - } else { - debug!("No cache entry for: {url}"); - self.fresh_request(req).await? - }; - Ok(cached_response) - } - - async fn send_cached_handle_stale( - &self, - mut req: Request, - url: Url, - cached: DataWithCachePolicy, - request: &Parts, - matches: bool, - ) -> Result, Error> { - if !matches { - // This shouldn't happen; if it does, we'll override the cache. - warn!("Cached request doesn't match current request for: {url}"); - return self.fresh_request(req).await; - } - - debug!("Sending revalidation request for: {url}"); - for header in &request.headers { - req.headers_mut().insert(header.0.clone(), header.1.clone()); - } - let res = self - .0 - .execute(req.try_clone().expect("streaming requests not supported")) - .instrument(info_span!("revalidation_request", url = url.as_str())) - .await - .map_err(ErrorKind::RequestMiddlewareError)? - .error_for_status() - .map_err(ErrorKind::RequestError)?; - let after_response = cached.cache_policy.after_response( - &RequestLikeReqwest(&req), - &ResponseLikeReqwest(&res), - SystemTime::now(), - ); - match after_response { - AfterResponse::NotModified(new_policy, _parts) => { - debug!("Found not-modified response for: {url}"); - let headers = CacheHeaders::from_response(res.headers().get_all("cache-control")); - let immutable = headers.is_immutable(); - Ok(CachedResponse::NotModified(DataWithCachePolicy { - data: cached.data, - immutable, - cache_policy: Box::new(new_policy), - })) - } - AfterResponse::Modified(new_policy, _parts) => { - debug!("Found modified response for: {url}"); - Ok(CachedResponse::ModifiedOrNew( - res, - new_policy.is_storable().then(|| Box::new(new_policy)), - )) - } - } - } - - #[instrument(skip_all, fields(url = req.url().as_str()))] - async fn fresh_request(&self, req: Request) -> Result, Error> { - trace!("{} {}", req.method(), req.url()); - let res = self - .0 - .execute(req.try_clone().expect("streaming requests not supported")) - .await - .map_err(ErrorKind::RequestMiddlewareError)? - .error_for_status() - .map_err(ErrorKind::RequestError)?; - let cache_policy = CachePolicy::new(&RequestLikeReqwest(&req), &ResponseLikeReqwest(&res)); - Ok(CachedResponse::ModifiedOrNew( - res, - cache_policy.is_storable().then(|| Box::new(cache_policy)), - )) - } -} - #[derive(Debug, Clone, Copy)] pub enum CacheControl { /// Respect the `cache-control` header from the response. @@ -341,44 +140,515 @@ impl From for CacheControl { } } -#[derive(Debug)] -struct RequestLikeReqwest<'a>(&'a Request); +/// Custom caching layer over [`reqwest::Client`]. +/// +/// The implementation takes inspiration from the `http-cache` crate, but adds support for running +/// an async callback on the response before caching. We use this to e.g. store a +/// parsed version of the wheel metadata and for our remote zip reader. In the latter case, we want +/// to read a single file from a remote zip using range requests (so we don't have to download the +/// entire file). 
We send a HEAD request in the caching layer to check if the remote file has +/// changed (and if range requests are supported), and in the callback we make the actual range +/// requests if required. +/// +/// Unlike `http-cache`, all outputs must be serializable/deserializable in some way, by +/// implementing the `Cacheable` trait. +/// +/// Again unlike `http-cache`, the caller gets full control over the cache key with the assumption +/// that it's a file. +#[derive(Debug, Clone)] +pub struct CachedClient(ClientWithMiddleware); -impl<'a> http_cache_semantics::RequestLike for RequestLikeReqwest<'a> { - fn uri(&self) -> http::uri::Uri { - // This converts from a url::Url (as returned by reqwest::Request::url) - // to a http::uri::Uri. The conversion requires parsing, but this is - // only called ~once per HTTP request. We can afford it. - self.0 - .url() - .as_str() - .parse() - .expect("reqwest::Request::url always returns a valid URL") +impl CachedClient { + pub fn new(client: ClientWithMiddleware) -> Self { + Self(client) } - fn is_same_uri(&self, other: &http::uri::Uri) -> bool { - // At time of writing, I saw no way to cheaply compare a http::uri::Uri - // with a url::Url. We can at least avoid parsing anything, and - // Url::as_str() is free. In practice though, this routine is called - // ~once per HTTP request. We can afford it. (And it looks like - // http::uri::Uri's PartialEq implementation has been tuned.) - self.0.url().as_str() == *other + + /// The middleware is the retry strategy + pub fn uncached(&self) -> ClientWithMiddleware { + self.0.clone() } - fn method(&self) -> &http::method::Method { - self.0.method() + + /// Make a cached request with a custom response transformation + /// while using serde to (de)serialize cached responses. + /// + /// If a new response was received (no prior cached response or modified + /// on the remote), the response is passed through `response_callback` and + /// only the result is cached and returned. The `response_callback` is + /// allowed to make subsequent requests, e.g. through the uncached client. + #[instrument(skip_all)] + pub async fn get_serde< + Payload: Serialize + DeserializeOwned + Send + 'static, + CallBackError, + Callback, + CallbackReturn, + >( + &self, + req: Request, + cache_entry: &CacheEntry, + cache_control: CacheControl, + response_callback: Callback, + ) -> Result> + where + Callback: FnOnce(Response) -> CallbackReturn + Send, + CallbackReturn: Future> + Send, + { + let payload = self + .get_cacheable(req, cache_entry, cache_control, move |resp| async { + let payload = response_callback(resp).await?; + Ok(SerdeCacheable { inner: payload }) + }) + .await?; + Ok(payload) } - fn headers(&self) -> &http::header::HeaderMap { - self.0.headers() + + /// Make a cached request with a custom response transformation while using + /// the `Cacheable` trait to (de)serialize cached responses. + /// + /// The purpose of this routine is the use of `Cacheable`. Namely, it + /// generalizes over (de)serialization such that mechanisms other than + /// serde (such as rkyv) can be used to manage (de)serialization of cached + /// data. + /// + /// If a new response was received (no prior cached response or modified + /// on the remote), the response is passed through `response_callback` and + /// only the result is cached and returned. The `response_callback` is + /// allowed to make subsequent requests, e.g. through the uncached client. 
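A hedged sketch of what a `get_serde` call site can look like, modeled loosely on the `flat_index.rs` change further down in this diff. The import paths assume `CachedClient` and `CacheControl` are re-exported at the crate root, and `anyhow` is used only to keep the sketch short; the callback only runs for new or modified responses, and its result is what gets cached.

```rust
use anyhow::Result;
use puffin_cache::CacheEntry;
use puffin_client::{CacheControl, CachedClient};

// Fetch a URL through the cached client, caching the response body as a
// plain `String`. On a cache hit (or a successful revalidation) the callback
// never runs and the cached payload is returned directly.
async fn fetch_text(
    client: &CachedClient,
    req: reqwest::Request,
    cache_entry: &CacheEntry,
) -> Result<String> {
    client
        .get_serde(req, cache_entry, CacheControl::None, |response| async move {
            // Only reached for new or modified responses; the returned value
            // is written to `cache_entry` alongside the computed cache policy.
            response.text().await
        })
        .await
        .map_err(|err| anyhow::anyhow!("cached request failed: {err:?}"))
}
```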
+ #[instrument(skip_all)] + pub async fn get_cacheable( + &self, + req: Request, + cache_entry: &CacheEntry, + cache_control: CacheControl, + response_callback: Callback, + ) -> Result> + where + Callback: FnOnce(Response) -> CallbackReturn, + CallbackReturn: Future> + Send, + { + let cached_response = match Self::read_cache(cache_entry).await { + Some(cached) => self.send_cached(req, cache_control, cached).boxed().await?, + None => { + debug!("No cache entry for: {}", req.url()); + self.fresh_request(req).await? + } + }; + match cached_response { + CachedResponse::FreshCache(cached) => Ok(Payload::from_aligned_bytes(cached.data)?), + CachedResponse::NotModified { cached, new_policy } => { + let refresh_cache = + info_span!("refresh_cache", file = %cache_entry.path().display()); + async { + let data_with_cache_policy_bytes = + DataWithCachePolicy::serialize(&new_policy, &cached.data)?; + write_atomic(cache_entry.path(), data_with_cache_policy_bytes) + .await + .map_err(ErrorKind::CacheWrite)?; + Ok(Payload::from_aligned_bytes(cached.data)?) + } + .instrument(refresh_cache) + .await + } + CachedResponse::ModifiedOrNew { + response, + cache_policy, + } => { + let new_cache = info_span!("new_cache", file = %cache_entry.path().display()); + let data = response_callback(response) + .boxed() + .await + .map_err(|err| CachedClientError::Callback(err))?; + let Some(cache_policy) = cache_policy else { + return Ok(data.into_target()); + }; + async { + fs_err::tokio::create_dir_all(cache_entry.dir()) + .await + .map_err(ErrorKind::CacheWrite)?; + let data_with_cache_policy_bytes = + DataWithCachePolicy::serialize(&cache_policy, &data.to_bytes()?)?; + write_atomic(cache_entry.path(), data_with_cache_policy_bytes) + .await + .map_err(ErrorKind::CacheWrite)?; + Ok(data.into_target()) + } + .instrument(new_cache) + .await + } + } + } + + async fn read_cache(cache_entry: &CacheEntry) -> Option { + let span = info_span!("read_and_parse_cache", file = %cache_entry.path().display()); + match span + .in_scope(|| DataWithCachePolicy::from_path_async(cache_entry.path())) + .await + { + Ok(data) => Some(data), + Err(err) => { + warn!( + "Broken cache entry at {}, removing: {err}", + cache_entry.path().display() + ); + let _ = fs_err::tokio::remove_file(&cache_entry.path()).await; + None + } + } + } + + /// Send a request given that we have a (possibly) stale cached response. + /// + /// If the cached response is valid but stale, then this will attempt a + /// revalidation request. + async fn send_cached( + &self, + mut req: Request, + cache_control: CacheControl, + cached: DataWithCachePolicy, + ) -> Result { + // Apply the cache control header, if necessary. + match cache_control { + CacheControl::None => {} + CacheControl::MustRevalidate => { + req.headers_mut().insert( + http::header::CACHE_CONTROL, + http::HeaderValue::from_static("no-cache"), + ); + } + } + Ok(match cached.cache_policy.before_request(&mut req) { + BeforeRequest::Fresh => { + debug!("Found fresh response for: {}", req.url()); + CachedResponse::FreshCache(cached) + } + BeforeRequest::Stale(new_cache_policy_builder) => { + debug!("Found stale response for: {}", req.url()); + self.send_cached_handle_stale(req, cached, new_cache_policy_builder) + .await? + } + BeforeRequest::NoMatch => { + // This shouldn't happen; if it does, we'll override the cache. + warn!( + "Cached request doesn't match current request for: {}", + req.url() + ); + self.fresh_request(req).await? 
+ } + }) + } + + async fn send_cached_handle_stale( + &self, + req: Request, + cached: DataWithCachePolicy, + new_cache_policy_builder: CachePolicyBuilder, + ) -> Result { + let url = req.url().clone(); + debug!("Sending revalidation request for: {url}"); + let response = self + .0 + .execute(req) + .instrument(info_span!("revalidation_request", url = url.as_str())) + .await + .map_err(ErrorKind::RequestMiddlewareError)? + .error_for_status() + .map_err(ErrorKind::RequestError)?; + match cached + .cache_policy + .after_response(new_cache_policy_builder, &response) + { + AfterResponse::NotModified(new_policy) => { + debug!("Found not-modified response for: {url}"); + Ok(CachedResponse::NotModified { + cached, + new_policy: Box::new(new_policy), + }) + } + AfterResponse::Modified(new_policy) => { + debug!("Found modified response for: {url}"); + Ok(CachedResponse::ModifiedOrNew { + response, + cache_policy: new_policy + .to_archived() + .is_storable() + .then(|| Box::new(new_policy)), + }) + } + } + } + + #[instrument(skip_all, fields(url = req.url().as_str()))] + async fn fresh_request(&self, req: Request) -> Result { + trace!("Sending fresh {} request for {}", req.method(), req.url()); + let cache_policy_builder = CachePolicyBuilder::new(&req); + let response = self + .0 + .execute(req) + .await + .map_err(ErrorKind::RequestMiddlewareError)? + .error_for_status() + .map_err(ErrorKind::RequestError)?; + let cache_policy = cache_policy_builder.build(&response); + Ok(CachedResponse::ModifiedOrNew { + response, + cache_policy: cache_policy + .to_archived() + .is_storable() + .then(|| Box::new(cache_policy)), + }) } } #[derive(Debug)] -struct ResponseLikeReqwest<'a>(&'a Response); +enum CachedResponse { + /// The cached response is fresh without an HTTP request (e.g. age < max-age). + FreshCache(DataWithCachePolicy), + /// The cached response is fresh after an HTTP request (e.g. 304 not modified) + NotModified { + /// The cached response (with its old cache policy). + cached: DataWithCachePolicy, + /// The new [`CachePolicy`] is used to determine if the response + /// is fresh or stale when making subsequent requests for the same + /// resource. This policy should overwrite the old policy associated + /// with the cached response. In particular, this new policy is derived + /// from data received in a revalidation response, which might change + /// the parameters of cache behavior. + /// + /// The policy is large (352 bytes at time of writing), so we reduce + /// the stack size by boxing it. + new_policy: Box, + }, + /// There was no prior cached response or the cache was outdated + /// + /// The cache policy is `None` if it isn't storable + ModifiedOrNew { + /// The response received from the server. + response: Response, + /// The [`CachePolicy`] is used to determine if the response is fresh or + /// stale when making subsequent requests for the same resource. + /// + /// The policy is large (352 bytes at time of writing), so we reduce + /// the stack size by boxing it. + cache_policy: Option>, + }, +} -impl<'a> http_cache_semantics::ResponseLike for ResponseLikeReqwest<'a> { - fn status(&self) -> http::status::StatusCode { - self.0.status() +/// Represents an arbitrary data blob with an associated HTTP cache policy. +/// +/// The cache policy is used to determine whether the data blob is stale or +/// not. +/// +/// # Format +/// +/// This type encapsulates the format for how blobs of data are stored on +/// disk. The format is very simple. First, the blob of data is written as-is. 
+/// Second, the archived representation of a `CachePolicy` is written. Thirdly, +/// the length, in bytes, of the archived `CachePolicy` is written as a 64-bit +/// little endian integer. +/// +/// Reading the format is done via an `AlignedVec` so that `rkyv` can correctly +/// read the archived representation of the data blob. The cache policy is +/// split into its own `AlignedVec` allocation. +/// +/// # Future ideas +/// +/// This format was also chosen because it should in theory permit rewriting +/// the cache policy without needing to rewrite the data blob if the blob has +/// not changed. For example, this case occurs when a revalidation request +/// responds with HTTP 304 NOT MODIFIED. At time of writing, this is not yet +/// implemented because 1) the synchronization specifics of mutating a cache +/// file have not been worked out and 2) it's not clear if it's a win. +/// +/// An alternative format would be to write the cache policy and the +/// blob in two distinct files. This would avoid needing to worry about +/// synchronization, but it means reading two files instead of one for every +/// cached response in the fast path. It's unclear whether it's worth it. +/// (Experiments have not yet been done.) +/// +/// Another approach here would be to memory map the file and rejigger +/// `OwnedArchive` (or create a new type) that works with a memory map instead +/// of an `AlignedVec`. This will require care to ensure alignment is handled +/// correctly. This approach has not been litigated yet. I did not start with +/// it because experiments with ripgrep have tended to show that (on Linux) +/// memory mapping a bunch of small files ends up being quite a bit slower than +/// just reading them on to the heap. +#[derive(Debug)] +pub struct DataWithCachePolicy { + pub data: AlignedVec, + cache_policy: OwnedArchive, +} + +impl DataWithCachePolicy { + /// Loads cached data and its associated HTTP cache policy from the given + /// file path in an asynchronous fashion (via `spawn_blocking`). + /// + /// # Errors + /// + /// If the given byte buffer is not in a valid format or if reading the + /// file given fails, then this returns an error. + async fn from_path_async(path: &Path) -> Result { + let path = path.to_path_buf(); + tokio::task::spawn_blocking(move || DataWithCachePolicy::from_path_sync(&path)) + .await + // This just forwards panics from the closure. + .unwrap() } - fn headers(&self) -> &http::header::HeaderMap { - self.0.headers() + + /// Loads cached data and its associated HTTP cache policy from the given + /// file path in a synchronous fashion. + /// + /// # Errors + /// + /// If the given byte buffer is not in a valid format or if reading the + /// file given fails, then this returns an error. + fn from_path_sync(path: &Path) -> Result { + let file = fs_err::File::open(path).map_err(ErrorKind::Io)?; + // Note that we don't wrap our file in a buffer because it will just + // get passed to AlignedVec::extend_from_reader, which doesn't benefit + // from an intermediary buffer. In effect, the AlignedVec acts as the + // buffer. + DataWithCachePolicy::from_reader(file) + } + + /// Loads cached data and its associated HTTP cache policy from the given + /// reader. + /// + /// # Errors + /// + /// If the given byte buffer is not in a valid format or if the reader + /// fails, then this returns an error. 
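A self-contained sketch of the on-disk layout described above (payload bytes, then the archived cache policy, then the policy length as a little-endian `u64` trailer), treating the policy as an opaque byte slice; the real code routes this through `OwnedArchive` and `AlignedVec`.

```rust
use std::io::{self, Write};

// Append `data`, then `policy`, then the policy length as a little-endian
// u64 trailer -- the same shape as the format documented above.
fn serialize(data: &[u8], policy: &[u8], mut wtr: impl Write) -> io::Result<()> {
    wtr.write_all(data)?;
    wtr.write_all(policy)?;
    let len = u64::try_from(policy.len())
        .map_err(|_| io::Error::other("policy length does not fit in u64"))?;
    wtr.write_all(&len.to_le_bytes())
}

// Split a serialized buffer back into `(data, policy)` slices by reading the
// trailer from the end.
fn split(bytes: &[u8]) -> io::Result<(&[u8], &[u8])> {
    let len_start = bytes
        .len()
        .checked_sub(8)
        .ok_or_else(|| io::Error::other("buffer shorter than 8-byte trailer"))?;
    let len_u64 = u64::from_le_bytes(bytes[len_start..].try_into().unwrap());
    let len = usize::try_from(len_u64)
        .map_err(|_| io::Error::other("policy length overflows usize"))?;
    let policy_start = len_start
        .checked_sub(len)
        .ok_or_else(|| io::Error::other("policy length exceeds buffer"))?;
    Ok((&bytes[..policy_start], &bytes[policy_start..len_start]))
}

fn main() -> io::Result<()> {
    let mut buf = Vec::new();
    serialize(b"payload bytes", b"archived policy", &mut buf)?;
    let (data, policy) = split(&buf)?;
    assert_eq!(data, b"payload bytes");
    assert_eq!(policy, b"archived policy");
    Ok(())
}
```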
+ pub fn from_reader(mut rdr: impl std::io::Read) -> Result { + let mut aligned_bytes = rkyv::util::AlignedVec::new(); + aligned_bytes + .extend_from_reader(&mut rdr) + .map_err(ErrorKind::Io)?; + DataWithCachePolicy::from_aligned_bytes(aligned_bytes) + } + + /// Loads cached data and its associated HTTP cache policy form an in + /// memory byte buffer. + /// + /// # Errors + /// + /// If the given byte buffer is not in a valid format, then this + /// returns an error. + fn from_aligned_bytes(mut bytes: AlignedVec) -> Result { + let cache_policy = DataWithCachePolicy::deserialize_cache_policy(&mut bytes)?; + Ok(DataWithCachePolicy { + data: bytes, + cache_policy, + }) + } + + /// Serializes the given cache policy and arbitrary data blob to an in + /// memory byte buffer. + /// + /// # Errors + /// + /// If there was a problem converting the given cache policy to its + /// serialized representation, then this routine will return an error. + fn serialize(cache_policy: &CachePolicy, data: &[u8]) -> Result, Error> { + let mut buf = vec![]; + DataWithCachePolicy::serialize_to_writer(cache_policy, data, &mut buf)?; + Ok(buf) + } + + /// Serializes the given cache policy and arbitrary data blob to the given + /// writer. + /// + /// # Errors + /// + /// If there was a problem converting the given cache policy to its + /// serialized representation or if the writer returns an error, then + /// this routine will return an error. + fn serialize_to_writer( + cache_policy: &CachePolicy, + data: &[u8], + mut wtr: impl std::io::Write, + ) -> Result<(), Error> { + let cache_policy_archived = OwnedArchive::from_unarchived(cache_policy)?; + let cache_policy_bytes = OwnedArchive::as_bytes(&cache_policy_archived); + wtr.write_all(data).map_err(ErrorKind::Io)?; + wtr.write_all(cache_policy_bytes).map_err(ErrorKind::Io)?; + let len = u64::try_from(cache_policy_bytes.len()).map_err(|_| { + let msg = format!( + "failed to represent {} (length of cache policy) in a u64", + cache_policy_bytes.len() + ); + ErrorKind::Io(std::io::Error::other(msg)) + })?; + wtr.write_all(&len.to_le_bytes()).map_err(ErrorKind::Io)?; + Ok(()) + } + + /// Deserializes a `OwnedArchive` off the end of the given + /// aligned bytes. Upon success, the given bytes will only contain the + /// data itself. The bytes representing the cached policy will have been + /// removed. + /// + /// # Errors + /// + /// This returns an error if the cache policy could not be deserialized + /// from the end of the given bytes. + fn deserialize_cache_policy( + bytes: &mut AlignedVec, + ) -> Result, Error> { + let len = DataWithCachePolicy::deserialize_cache_policy_len(bytes)?; + let cache_policy_bytes_start = bytes.len() - (len + 8); + let cache_policy_bytes = &bytes[cache_policy_bytes_start..][..len]; + let mut cache_policy_bytes_aligned = AlignedVec::with_capacity(len); + cache_policy_bytes_aligned.extend_from_slice(cache_policy_bytes); + assert!( + cache_policy_bytes_start <= bytes.len(), + "slicing cache policy should result in a truncation" + ); + // Technically this will keep the extra capacity used to store the + // cache policy around. But it should be pretty small, and it saves a + // realloc. (It's unclear whether that matters more or less than the + // extra memory usage.) + bytes.resize(cache_policy_bytes_start, 0); + OwnedArchive::new(cache_policy_bytes_aligned) + } + + /// Deserializes the length, in bytes, of the cache policy given a complete + /// serialized byte buffer of a `DataWithCachePolicy`. 
+ /// + /// Upon success, callers are guaranteed that + /// `&bytes[bytes.len() - (len + 8)..][..len]` will not panic. + /// + /// # Errors + /// + /// This returns an error if the length could not be read as a `usize` or is + /// otherwise known to be invalid. (For example, it is a length that is bigger + /// than `bytes.len()`.) + fn deserialize_cache_policy_len(bytes: &[u8]) -> Result { + let Some(cache_policy_len_start) = bytes.len().checked_sub(8) else { + let msg = format!( + "data-with-cache-policy buffer should be at least 8 bytes \ + in length, but is {} bytes", + bytes.len(), + ); + return Err(ErrorKind::ArchiveRead(msg).into()); + }; + let cache_policy_len_bytes = <[u8; 8]>::try_from(&bytes[cache_policy_len_start..]) + .expect("cache policy length is 8 bytes"); + let len_u64 = u64::from_le_bytes(cache_policy_len_bytes); + let Ok(len_usize) = usize::try_from(len_u64) else { + let msg = format!( + "data-with-cache-policy has cache policy length of {}, \ + but overflows usize", + len_u64, + ); + return Err(ErrorKind::ArchiveRead(msg).into()); + }; + if bytes.len() < len_usize + 8 { + let msg = format!( + "invalid cache entry: data-with-cache-policy has cache policy length of {}, \ + but total buffer size is {}", + len_usize, + bytes.len(), + ); + return Err(ErrorKind::ArchiveRead(msg).into()); + } + Ok(len_usize) } } diff --git a/crates/puffin-client/src/flat_index.rs b/crates/puffin-client/src/flat_index.rs index f62e01409..816fbd2bb 100644 --- a/crates/puffin-client/src/flat_index.rs +++ b/crates/puffin-client/src/flat_index.rs @@ -132,7 +132,7 @@ impl<'a> FlatIndexClient<'a> { .instrument(info_span!("parse_flat_index_html", url = % url)) }; let files = cached_client - .get_cached_with_callback( + .get_serde( flat_index_request, &cache_entry, cache_control, diff --git a/crates/puffin-client/src/httpcache/control.rs b/crates/puffin-client/src/httpcache/control.rs new file mode 100644 index 000000000..343494540 --- /dev/null +++ b/crates/puffin-client/src/httpcache/control.rs @@ -0,0 +1,775 @@ +use std::collections::HashSet; + +use crate::rkyvutil::OwnedArchive; + +/// Represents values for relevant cache-control directives. +/// +/// This does include some directives that we don't use mostly because they are +/// trivial to support. (For example, `must-understand` at time of writing is +/// not used in our HTTP cache semantics. Neither is `proxy-revalidate` since +/// we are not a proxy.) 
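The directive struct defined just below is populated by collecting `Cache-Control` header values (see the `FromIterator` impls and `CacheControlParser` further down). Here is a deliberately simplified, standalone model of that collection step, covering only a few directives and none of the quoting, duplicate, or invalid-input handling of the real parser.

```rust
// Simplified model: split on commas, lowercase the directive name, and keep
// only the handful of directives this sketch cares about. The real parser in
// `control.rs` also handles quoted values, duplicates, and invalid input
// (which it maps to `must-revalidate`).
#[derive(Debug, Default)]
struct SimpleCacheControl {
    max_age_seconds: Option<u64>,
    immutable: bool,
    no_store: bool,
}

fn collect_directives<'a>(headers: impl IntoIterator<Item = &'a str>) -> SimpleCacheControl {
    let mut cc = SimpleCacheControl::default();
    for header in headers {
        for part in header.split(',') {
            let mut kv = part.trim().splitn(2, '=');
            let name = kv.next().unwrap_or("").to_ascii_lowercase();
            let value = kv.next().map(str::trim);
            match name.as_str() {
                "max-age" => cc.max_age_seconds = value.and_then(|v| v.parse::<u64>().ok()),
                "immutable" => cc.immutable = true,
                "no-store" => cc.no_store = true,
                _ => {}
            }
        }
    }
    cc
}

fn main() {
    let cc = collect_directives(["max-age=600, immutable"]);
    assert_eq!(cc.max_age_seconds, Some(600));
    assert!(cc.immutable);
    assert!(!cc.no_store);
}
```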
+#[derive( + Clone, + Debug, + Default, + Eq, + PartialEq, + serde::Serialize, + serde::Deserialize, + rkyv::Archive, + rkyv::Deserialize, + rkyv::Serialize, +)] +#[archive(check_bytes)] +#[archive_attr(derive(Debug))] +pub struct CacheControl { + // directives for requests and responses + /// * https://www.rfc-editor.org/rfc/rfc9111.html#name-max-age + /// * https://www.rfc-editor.org/rfc/rfc9111.html#name-max-age-2 + pub max_age_seconds: Option, + /// * https://www.rfc-editor.org/rfc/rfc9111.html#name-no-cache + /// * https://www.rfc-editor.org/rfc/rfc9111.html#name-no-cache-2 + pub no_cache: bool, + /// * https://www.rfc-editor.org/rfc/rfc9111.html#name-no-store + /// * https://www.rfc-editor.org/rfc/rfc9111.html#name-no-store-2 + pub no_store: bool, + /// * https://www.rfc-editor.org/rfc/rfc9111.html#name-no-transform + /// * https://www.rfc-editor.org/rfc/rfc9111.html#name-no-transform-2 + pub no_transform: bool, + + // request-only directives + /// https://www.rfc-editor.org/rfc/rfc9111.html#name-max-stale + pub max_stale_seconds: Option, + /// https://www.rfc-editor.org/rfc/rfc9111.html#name-min-fresh + pub min_fresh_seconds: Option, + + // response-only directives + /// https://www.rfc-editor.org/rfc/rfc9111.html#name-only-if-cached + pub only_if_cached: bool, + /// https://www.rfc-editor.org/rfc/rfc9111.html#name-must-revalidate + pub must_revalidate: bool, + /// https://www.rfc-editor.org/rfc/rfc9111.html#name-must-understand + pub must_understand: bool, + /// https://www.rfc-editor.org/rfc/rfc9111.html#name-private + pub private: bool, + /// https://www.rfc-editor.org/rfc/rfc9111.html#name-proxy-revalidate + pub proxy_revalidate: bool, + /// https://www.rfc-editor.org/rfc/rfc9111.html#name-public + pub public: bool, + /// https://www.rfc-editor.org/rfc/rfc9111.html#name-s-maxage + pub s_maxage_seconds: Option, + /// https://httpwg.org/specs/rfc8246.html + pub immutable: bool, +} + +impl CacheControl { + /// Convert this to an owned archive value. + pub fn to_archived(&self) -> OwnedArchive { + // There's no way (other than OOM) for serializing this type to fail. + OwnedArchive::from_unarchived(self).expect("all possible values can be archived") + } +} + +impl<'b, B: 'b + ?Sized + AsRef<[u8]>> FromIterator<&'b B> for CacheControl { + fn from_iter>(it: T) -> CacheControl { + CacheControl::from_iter(CacheControlParser::new(it)) + } +} + +impl FromIterator for CacheControl { + fn from_iter>(it: T) -> CacheControl { + fn parse_int(value: &[u8]) -> Option { + if !value.iter().all(u8::is_ascii_digit) { + return None; + } + std::str::from_utf8(value).ok()?.parse().ok() + } + + let mut cc = CacheControl::default(); + for ccd in it { + // Note that when we see invalid directive values, we follow [RFC + // 9111 S4.2.1]. It says that invalid cache-control directives + // should result in treating the response as stale. (Which we + // accomplished by setting `must_revalidate` to `true`.) + // + // [RFC 9111 S4.2.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.2.1 + match &*ccd.name { + // request + response directives + "max-age" => match parse_int(&ccd.value) { + None => cc.must_revalidate = true, + Some(seconds) => cc.max_age_seconds = Some(seconds), + }, + "no-cache" => cc.no_cache = true, + "no-store" => cc.no_store = true, + "no-transform" => cc.no_transform = true, + // request-only directives + "max-stale" => { + // As per [RFC 9111 S5.2.1.2], "If no value is assigned to + // max-stale, then the client will accept a stale response + // of any age." 
We implement that by just using the maximum + // number of seconds. + // + // [RFC 9111 S5.2.1.2]: https://www.rfc-editor.org/rfc/rfc9111.html#section-5.2.1.2 + if ccd.value.is_empty() { + cc.max_stale_seconds = Some(u64::MAX); + } else { + match parse_int(&ccd.value) { + None => cc.must_revalidate = true, + Some(seconds) => cc.max_stale_seconds = Some(seconds), + } + } + } + "min-fresh" => match parse_int(&ccd.value) { + None => cc.must_revalidate = true, + Some(seconds) => cc.min_fresh_seconds = Some(seconds), + }, + "only-if-cached" => cc.only_if_cached = true, + "must-revalidate" => cc.must_revalidate = true, + "must-understand" => cc.must_understand = true, + "private" => cc.private = true, + "proxy-revalidate" => cc.proxy_revalidate = true, + "public" => cc.public = true, + "s-maxage" => match parse_int(&ccd.value) { + None => cc.must_revalidate = true, + Some(seconds) => cc.s_maxage_seconds = Some(seconds), + }, + "immutable" => cc.immutable = true, + _ => {} + } + } + cc + } +} + +/// A parser for the HTTP `Cache-Control` header. +/// +/// The parser is mostly defined across multiple parts of multiple RFCs. +/// Namely, [RFC 9110 S5.6.2] says how to parse the names (or "keys") of each +/// directive (whose format is a "token"). [RFC 9110 S5.6.4] says how to parse +/// quoted values. And finally, [RFC 9111 Appendex A] gives the ABNF for the +/// overall header value. +/// +/// This parser accepts an iterator of anything that can be cheaply converted +/// to a byte string (e.g., `http::header::HeaderValue`). Directives are parsed +/// from zero or more of these byte strings. Parsing cannot return an error, +/// but if something unexpected is found, the rest of that header value is +/// skipped. +/// +/// Duplicate directives provoke an automatic insertion of `must-revalidate`, +/// as implied by [RFC 9111 S4.2.1], to ensure that the client will talk to the +/// server before using anything in case. +/// +/// This parser handles a bit more than what we actually need in +/// `puffin-client`. For example, we don't need to handle quoted values at all +/// since either don't use or care about values that require quoted. With that +/// said, the parser handles these because it wasn't that much extra work to do +/// so and just generally seemed like good sense. (If we didn't handle them and +/// parsed them incorrectly, that might mean parsing subsequent directives that +/// we do care about incorrectly.) +/// +/// [RFC 9110 S5.6.2]: https://www.rfc-editor.org/rfc/rfc9110.html#name-tokens +/// [RFC 9110 S5.6.4]: https://www.rfc-editor.org/rfc/rfc9110.html#name-quoted-strings +/// [RFC 9111 Appendix A]: https://www.rfc-editor.org/rfc/rfc9111.html#name-collected-abnf +/// [RFC 9111 S4.2.1]: https://www.rfc-editor.org/rfc/rfc9111.html#calculating.freshness.lifetime +struct CacheControlParser<'b, I> { + cur: &'b [u8], + directives: I, + seen: HashSet, +} + +impl<'b, B: 'b + ?Sized + AsRef<[u8]>, I: Iterator> CacheControlParser<'b, I> { + /// Create a new parser of zero or more `Cache-Control` header values. The + /// given iterator should yield elements that satisfy `AsRef<[u8]>`. + fn new>(headers: II) -> CacheControlParser<'b, I> { + let mut directives = headers.into_iter(); + let cur = directives.next().map(|h| h.as_ref()).unwrap_or(b""); + CacheControlParser { + cur, + directives, + seen: HashSet::new(), + } + } + + /// Parses a token according to [RFC 9110 S5.6.2]. + /// + /// If no token is found at the current position, then this returns `None`. 
+ /// Usually this indicates an invalid cache-control directive. + /// + /// This does not trim whitespace before or after the token. + /// + /// [RFC 9110 S5.6.2]: https://www.rfc-editor.org/rfc/rfc9110.html#name-tokens + fn parse_token(&mut self) -> Option { + /// Returns true when the given byte can appear in a token, as + /// defined by [RFC 9110 S5.6.2]. + /// + /// [RFC 9110 S5.6.2]: https://www.rfc-editor.org/rfc/rfc9110.html#name-tokens + fn is_token_byte(byte: u8) -> bool { + matches!( + byte, + | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' + | b'-' | b'.' | b'^' | b'_' | b'`' | b'|' | b'~' + | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z', + ) + } + let mut end = 0; + while self.cur.get(end).copied().map_or(false, is_token_byte) { + end += 1; + } + if end == 0 { + None + } else { + let (token, rest) = self.cur.split_at(end); + self.cur = rest; + // This can't fail because `end` is only incremented when the + // current byte is a valid token byte. And all valid token bytes + // are ASCII and thus valid UTF-8. + Some(String::from_utf8(token.to_vec()).expect("all valid token bytes are valid UTF-8")) + } + } + + /// Looks for an `=` as per [RFC 9111 Appendix A] which indicates that a + /// cache directive has a value. + /// + /// This returns true if one was found. In which case, the `=` is consumed. + /// + /// This does not trim whitespace before or after the token. + /// + /// [RFC 9111 Appendix A]: https://www.rfc-editor.org/rfc/rfc9111.html#name-collected-abnf + fn maybe_parse_equals(&mut self) -> bool { + if self.cur.first().map_or(false, |&byte| byte == b'=') { + self.cur = &self.cur[1..]; + true + } else { + false + } + } + + /// Parses a directive value as either an unquoted token or a quoted string + /// as per [RFC 9111 Appendix A]. + /// + /// If a valid value could not be found (for example, end-of-input or an + /// opening quote without a closing quote), then `None` is returned. In + /// this case, one should consider the cache-control header invalid. + /// + /// This does not trim whitespace before or after the token. + /// + /// Note that the returned value is *not* guaranteed to be valid UTF-8. + /// Namely, it is possible for a quoted string to contain invalid UTF-8. + /// + /// [RFC 9111 Appendix A]: https://www.rfc-editor.org/rfc/rfc9111.html#name-collected-abnf + fn parse_value(&mut self) -> Option> { + if *self.cur.first()? == b'"' { + self.cur = &self.cur[1..]; + self.parse_quoted_string() + } else { + self.parse_token().map(|s| s.into_bytes()) + } + } + + /// Parses a quoted string as per [RFC 9110 S5.6.4]. + /// + /// This assumes the opening quote has already been consumed. + /// + /// If an invalid quoted string was found (e.g., no closing quote), then + /// `None` is returned. An empty value may be returned. + /// + /// Note that the returned value is *not* guaranteed to be valid UTF-8. + /// Namely, it is possible for a quoted string to contain invalid UTF-8. 
+ /// + /// [RFC 9110 S5.6.4]: https://www.rfc-editor.org/rfc/rfc9110.html#name-quoted-strings + fn parse_quoted_string(&mut self) -> Option> { + fn is_qdtext_byte(byte: u8) -> bool { + matches!(byte, b'\t' | b' ' | 0x21 | 0x23..=0x5B | 0x5D..=0x7E | 0x80..=0xFF) + } + fn is_quoted_pair_byte(byte: u8) -> bool { + matches!(byte, b'\t' | b' ' | 0x21..=0x7E | 0x80..=0xFF) + } + let mut value = vec![]; + while !self.cur.is_empty() { + let byte = self.cur[0]; + self.cur = &self.cur[1..]; + if byte == b'"' { + return Some(value); + } else if byte == b'\\' { + let byte = *self.cur.first()?; + self.cur = &self.cur[1..]; + // If we saw an escape but didn't see a valid + // escaped byte, then we treat this value as + // invalid. + if !is_quoted_pair_byte(byte) { + return None; + } + value.push(byte); + } else if is_qdtext_byte(byte) { + value.push(byte); + } else { + break; + } + } + // If we got here, it means we hit end-of-input before seeing a closing + // quote. So we treat this as invalid and return `None`. + None + } + + /// Looks for a `,` as per [RFC 9111 Appendix A]. If one is found, then it + /// is consumed and this returns true. + /// + /// This does not trim whitespace before or after the token. + /// + /// [RFC 9111 Appendix A]: https://www.rfc-editor.org/rfc/rfc9111.html#name-collected-abnf + fn maybe_parse_directive_delimiter(&mut self) -> bool { + if self.cur.first().map_or(false, |&byte| byte == b',') { + self.cur = &self.cur[1..]; + true + } else { + false + } + } + + /// [RFC 9111 Appendix A] says that optional whitespace may appear between + /// cache directives. We actually also allow whitespace to appear before + /// the first directive and after the last directive. + /// + /// [RFC 9111 Appendix A]: https://www.rfc-editor.org/rfc/rfc9111.html#name-collected-abnf + fn skip_whitespace(&mut self) { + while self.cur.first().map_or(false, u8::is_ascii_whitespace) { + self.cur = &self.cur[1..]; + } + } + + fn emit_directive( + &mut self, + directive: CacheControlDirective, + ) -> Option { + let duplicate = !self.seen.insert(directive.name.clone()); + if duplicate { + self.emit_revalidation() + } else { + Some(directive) + } + } + + fn emit_revalidation(&mut self) -> Option { + if self.seen.insert("must-revalidate".to_string()) { + Some(CacheControlDirective::must_revalidate()) + } else { + // If we've already emitted a must-revalidate + // directive, then don't do it again. + None + } + } +} + +impl<'b, B: 'b + ?Sized + AsRef<[u8]>, I: Iterator> Iterator + for CacheControlParser<'b, I> +{ + type Item = CacheControlDirective; + + fn next(&mut self) -> Option { + loop { + if self.cur.is_empty() { + self.cur = self.directives.next().map(|h| h.as_ref())?; + } + while !self.cur.is_empty() { + self.skip_whitespace(); + let Some(mut name) = self.parse_token() else { + // If we fail to parse a token, then this header value is + // either corrupt or empty. So skip the rest of it. + let invalid = !self.cur.is_empty(); + self.cur = b""; + // But if it was invalid, force revalidation. + if invalid { + if let Some(d) = self.emit_revalidation() { + return Some(d); + } + } + break; + }; + name.make_ascii_lowercase(); + if !self.maybe_parse_equals() { + // Eat up whitespace and the next delimiter. We don't care + // if we find a terminator. 
+ self.skip_whitespace(); + self.maybe_parse_directive_delimiter(); + let directive = CacheControlDirective { + name, + value: vec![], + }; + match self.emit_directive(directive) { + None => continue, + Some(d) => return Some(d), + } + } + let Some(value) = self.parse_value() else { + // If we expected a value (we saw an =) but couldn't find a + // valid value, then this header value is probably corrupt. + // So skip the rest of it. + self.cur = b""; + match self.emit_revalidation() { + None => break, + Some(d) => return Some(d), + } + }; + // Eat up whitespace and the next delimiter. We don't care if + // we find a terminator. + self.skip_whitespace(); + self.maybe_parse_directive_delimiter(); + let directive = CacheControlDirective { name, value }; + match self.emit_directive(directive) { + None => continue, + Some(d) => return Some(d), + } + } + } + } +} + +/// A single directive from the `Cache-Control` header. +#[derive(Debug, Eq, PartialEq)] +struct CacheControlDirective { + /// The name of the directive. + name: String, + /// A possibly empty value. + /// + /// Note that directive values may contain invalid UTF-8. (Although they + /// cannot actually contain arbitrary bytes. For example, NUL bytes, among + /// others, are not allowed.) + value: Vec, +} + +impl CacheControlDirective { + /// Returns a `must-revalidate` directive. This is useful for forcing a + /// cache decision that the response is stale, and thus the server should + /// be consulted for whether the cached response ought to be used or not. + fn must_revalidate() -> CacheControlDirective { + CacheControlDirective { + name: "must-revalidate".to_string(), + value: vec![], + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn cache_control_token() { + let cc: CacheControl = CacheControlParser::new(["no-cache"]).collect(); + assert!(cc.no_cache); + assert!(!cc.must_revalidate); + } + + #[test] + fn cache_control_max_age() { + let cc: CacheControl = CacheControlParser::new(["max-age=60"]).collect(); + assert_eq!(Some(60), cc.max_age_seconds); + assert!(!cc.must_revalidate); + } + + // [RFC 9111 S5.2.1.1] says that client MUST NOT quote max-age, but we + // support parsing it that way anyway. + // + // [RFC 9111 S5.2.1.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-5.2.1.1 + #[test] + fn cache_control_max_age_quoted() { + let cc: CacheControl = CacheControlParser::new([r#"max-age="60""#]).collect(); + assert_eq!(Some(60), cc.max_age_seconds); + assert!(!cc.must_revalidate); + } + + #[test] + fn cache_control_max_age_invalid() { + let cc: CacheControl = CacheControlParser::new(["max-age=6a0"]).collect(); + assert_eq!(None, cc.max_age_seconds); + assert!(cc.must_revalidate); + } + + #[test] + fn cache_control_immutable() { + let cc: CacheControl = CacheControlParser::new(["max-age=31536000, immutable"]).collect(); + assert_eq!(Some(31_536_000), cc.max_age_seconds); + assert!(cc.immutable); + assert!(!cc.must_revalidate); + } + + #[test] + fn cache_control_unrecognized() { + let cc: CacheControl = CacheControlParser::new(["lion,max-age=60,zebra"]).collect(); + assert_eq!(Some(60), cc.max_age_seconds); + } + + #[test] + fn cache_control_invalid_squashes_remainder() { + let cc: CacheControl = CacheControlParser::new(["no-cache,\x00,max-age=60"]).collect(); + // The invalid data doesn't impact things before it. + assert!(cc.no_cache); + // The invalid data precludes parsing anything after. + assert_eq!(None, cc.max_age_seconds); + // The invalid contents should force revalidation. 
+ assert!(cc.must_revalidate); + } + + #[test] + fn cache_control_invalid_squashes_remainder_but_not_other_header_values() { + let cc: CacheControl = + CacheControlParser::new(["no-cache,\x00,max-age=60", "max-stale=30"]).collect(); + // The invalid data doesn't impact things before it. + assert!(cc.no_cache); + // The invalid data precludes parsing anything after + // in the same header value, but not in other + // header values. + assert_eq!(Some(30), cc.max_stale_seconds); + // The invalid contents should force revalidation. + assert!(cc.must_revalidate); + } + + #[test] + fn cache_control_parse_token() { + let directives = CacheControlParser::new(["no-cache"]).collect::>(); + assert_eq!( + directives, + vec![CacheControlDirective { + name: "no-cache".to_string(), + value: vec![] + }] + ); + } + + #[test] + fn cache_control_parse_token_to_token_value() { + let directives = CacheControlParser::new(["max-age=60"]).collect::>(); + assert_eq!( + directives, + vec![CacheControlDirective { + name: "max-age".to_string(), + value: b"60".to_vec(), + }] + ); + } + + #[test] + fn cache_control_parse_token_to_quoted_string() { + let directives = + CacheControlParser::new([r#"private="cookie,x-something-else""#]).collect::>(); + assert_eq!( + directives, + vec![CacheControlDirective { + name: "private".to_string(), + value: b"cookie,x-something-else".to_vec(), + }] + ); + } + + #[test] + fn cache_control_parse_token_to_quoted_string_with_escape() { + let directives = + CacheControlParser::new([r#"private="something\"crazy""#]).collect::>(); + assert_eq!( + directives, + vec![CacheControlDirective { + name: "private".to_string(), + value: br#"something"crazy"#.to_vec(), + }] + ); + } + + #[test] + fn cache_control_parse_multiple_directives() { + let header = r#"max-age=60, no-cache, private="cookie", no-transform"#; + let directives = CacheControlParser::new([header]).collect::>(); + assert_eq!( + directives, + vec![ + CacheControlDirective { + name: "max-age".to_string(), + value: b"60".to_vec(), + }, + CacheControlDirective { + name: "no-cache".to_string(), + value: vec![] + }, + CacheControlDirective { + name: "private".to_string(), + value: b"cookie".to_vec(), + }, + CacheControlDirective { + name: "no-transform".to_string(), + value: vec![] + }, + ] + ); + } + + #[test] + fn cache_control_parse_multiple_directives_across_multiple_header_values() { + let headers = [ + r#"max-age=60, no-cache"#, + r#"private="cookie""#, + r#"no-transform"#, + ]; + let directives = CacheControlParser::new(headers).collect::>(); + assert_eq!( + directives, + vec![ + CacheControlDirective { + name: "max-age".to_string(), + value: b"60".to_vec(), + }, + CacheControlDirective { + name: "no-cache".to_string(), + value: vec![] + }, + CacheControlDirective { + name: "private".to_string(), + value: b"cookie".to_vec(), + }, + CacheControlDirective { + name: "no-transform".to_string(), + value: vec![] + }, + ] + ); + } + + #[test] + fn cache_control_parse_one_header_invalid() { + let headers = [ + r#"max-age=60, no-cache"#, + r#", private="cookie""#, + r#"no-transform"#, + ]; + let directives = CacheControlParser::new(headers).collect::>(); + assert_eq!( + directives, + vec![ + CacheControlDirective { + name: "max-age".to_string(), + value: b"60".to_vec(), + }, + CacheControlDirective { + name: "no-cache".to_string(), + value: vec![] + }, + CacheControlDirective { + name: "must-revalidate".to_string(), + value: vec![] + }, + CacheControlDirective { + name: "no-transform".to_string(), + value: vec![] + }, + ] + ); + } + + 
#[test] + fn cache_control_parse_invalid_directive_drops_remainder() { + let header = r#"max-age=60, no-cache, ="cookie", no-transform"#; + let directives = CacheControlParser::new([header]).collect::>(); + assert_eq!( + directives, + vec![ + CacheControlDirective { + name: "max-age".to_string(), + value: b"60".to_vec(), + }, + CacheControlDirective { + name: "no-cache".to_string(), + value: vec![] + }, + CacheControlDirective { + name: "must-revalidate".to_string(), + value: vec![] + }, + ] + ); + } + + #[test] + fn cache_control_parse_name_normalized() { + let header = r#"MAX-AGE=60"#; + let directives = CacheControlParser::new([header]).collect::>(); + assert_eq!( + directives, + vec![CacheControlDirective { + name: "max-age".to_string(), + value: b"60".to_vec(), + },] + ); + } + + // When a duplicate directive is found, we keep the first one + // and add in a `must-revalidate` directive to indicate that + // things are stale and the client should do a re-check. + #[test] + fn cache_control_parse_duplicate_directives() { + let header = r#"max-age=60, no-cache, max-age=30"#; + let directives = CacheControlParser::new([header]).collect::>(); + assert_eq!( + directives, + vec![ + CacheControlDirective { + name: "max-age".to_string(), + value: b"60".to_vec(), + }, + CacheControlDirective { + name: "no-cache".to_string(), + value: vec![] + }, + CacheControlDirective { + name: "must-revalidate".to_string(), + value: vec![] + }, + ] + ); + } + + #[test] + fn cache_control_parse_duplicate_directives_across_headers() { + let headers = [r#"max-age=60, no-cache"#, r#"max-age=30"#]; + let directives = CacheControlParser::new(headers).collect::>(); + assert_eq!( + directives, + vec![ + CacheControlDirective { + name: "max-age".to_string(), + value: b"60".to_vec(), + }, + CacheControlDirective { + name: "no-cache".to_string(), + value: vec![] + }, + CacheControlDirective { + name: "must-revalidate".to_string(), + value: vec![] + }, + ] + ); + } + + // Tests that we don't emit must-revalidate multiple times + // even when something is duplicated multiple times. + #[test] + fn cache_control_parse_duplicate_redux() { + let header = r#"max-age=60, no-cache, no-cache, max-age=30"#; + let directives = CacheControlParser::new([header]).collect::>(); + assert_eq!( + directives, + vec![ + CacheControlDirective { + name: "max-age".to_string(), + value: b"60".to_vec(), + }, + CacheControlDirective { + name: "no-cache".to_string(), + value: vec![] + }, + CacheControlDirective { + name: "must-revalidate".to_string(), + value: vec![] + }, + ] + ); + } +} diff --git a/crates/puffin-client/src/httpcache/mod.rs b/crates/puffin-client/src/httpcache/mod.rs new file mode 100644 index 000000000..8e1344865 --- /dev/null +++ b/crates/puffin-client/src/httpcache/mod.rs @@ -0,0 +1,1358 @@ +/*! +A somewhat simplistic implementation of HTTP cache semantics. + +This implementation was guided by the following things: + +* RFCs 9110 and 9111. +* The `http-cache-semantics` crate. (The implementation here is completely +different, but the source of `http-cache-semantics` helped guide the +implementation here and understanding of HTTP caching.) +* A desire for our cache policy to support zero-copy deserialization. That +is, we want the cached response fast path (where no revalidation request is +necessary) to avoid any costly deserialization for the cache policy at all. + +# Flow + +While one has to read the relevant RFCs to get a full understanding of HTTP +caching, doing so is... difficult to say the least. 
It is at the very least
+not quick to do because the semantics are scattered all over the place. But, I
+think we can do a quick overview here.
+
+Let's start with the obvious. HTTP caching exists to avoid network requests,
+and, if a request is unavoidable, bandwidth. The central actor in HTTP
+caching is the `Cache-Control` header, which can exist on *both* requests and
+responses. The value of this header is a list of directives that control
+caching behavior. They can outright disable it (`no-store`), force cache
+invalidation (`no-cache`) or even permit the cache to return responses that
+are explicitly stale (`max-stale`).
+
+The main thing that typically drives cache interactions is `max-age`. When set
+on a response, this means that the server is willing to let clients hold on to
+a response for up to the amount of time in `max-age` before the client must ask
+the server for a fresh response. In our case, the main utility of `max-age` is
+twofold:
+
+* PyPI sets a `max-age` of 600 seconds (10 minutes) on its responses. As long
+as our cached responses have an age less than this, we can completely avoid
+talking to PyPI at all when we need access to the full set of versions for a
+package.
+* Most other assets, like wheels, are forever immutable. They will never
+change. So servers will typically set a very high `max-age`, which means we
+will almost never need to ask the server for permission to reuse our cached
+wheel.
+
+When a cached response exceeds the `max-age` configured on a response, then
+we call that response stale. Generally speaking, we won't return responses
+from the cache that are known to be stale. (This can be overridden in the
+request by adding a `max-stale` cache-control directive, but nothing in Puffin
+does this at time of writing.) When a response is stale, we don't necessarily
+need to give up completely. It is at this point that we can send something
+called a re-validation request.
+
+A re-validation request includes with it some metadata (usually an "entity tag"
+or `etag` for short) that was on the cached response (which is now stale).
+When we send this request, the server can compare it with its most up to date
+version of the resource. If its entity tag matches the one we gave it (among
+other possible criteria), then the server can skip returning the body and
+instead just return a small HTTP 304 NOT MODIFIED response. When we get this
+type of response, it's the server telling us that our cached response which we
+*thought* was stale is no longer stale. It's fresh again and we needn't get a
+new copy. We will need to update our stored `CachePolicy` though, since the
+HTTP 304 NOT MODIFIED response we got might include updated metadata relevant
+to the behavior of caching (like a new `Age` header).
+
+# Scope
+
+In general, the cache semantics implemented below are targeted toward Puffin's
+use case: a private client cache for custom data objects. This constraint
+results in a modest simplification in what we need to support. That is, we
+don't need to cache the entirety of the request's or response's headers (like
+what `http-cache-semantics` does). Instead, we only need to cache the data
+necessary to *make decisions* about HTTP caching.
+
+One example of this is the `Vary` response header. This requires checking that
+the headers listed in a cached response have the same value in the original
+request and the new request.
If the new request has different values for those
+headers (as specified in the cached response) than what was in the original
+request, then the new request cannot use our cached response. Normally, this
+would seemingly require storing all of the original request's headers. But we
+only store the headers listed in the response.
+
+Also, since we aren't a proxy, there are a host of proxy-specific rules for
+managing headers and data that we needn't care about.
+
+# Zero-copy deserialization
+
+As mentioned above, we would really like our fast path (that is, a cached
+response that we deem "fresh" and thus don't need to send a re-validation
+request for) to avoid needing to actually deserialize a `CachePolicy`. While a
+`CachePolicy` isn't particularly big, it is in our critical path. Yet, we still
+need a `CachePolicy` to be able to decide whether a cached response is still
+fresh or not. (This decision procedure is non-trivial, so it *probably* doesn't
+make too much sense to hack around it with something simpler.)
+
+We attempt to achieve this by implementing the `rkyv` traits for all of our
+types. This means that if we read a `Vec<u8>` from a file, then we can very
+cheaply turn that into a `rkyvutil::OwnedArchive<CachePolicy>`. Creating that
+only requires a quick validation step, but is otherwise free. We can then
+use that as if it were an `Archived<CachePolicy>` (which is an alias for the
+`ArchivedCachePolicy` type implicitly introduced by `derive(rkyv::Archive)`).
+Crucially, this is why we implement all of our HTTP cache semantics logic on
+`ArchivedCachePolicy` and *not* `CachePolicy`. It can be easy to forget this
+because `rkyv` does such an amazing job of making its use of archived types
+very closely resemble that of the standard types. For example, whenever the
+methods below are accessing a field whose type is a `Vec` in the normal type,
+what's actually being accessed is a [`rkyv::vec::ArchivedVec`]. Similarly,
+for strings, it's [`rkyv::string::ArchivedString`] and not a standard library
+`String`. This all works somewhat seamlessly because all of the cache semantics
+are generally just read-only operations, but if you stray from the path, you're
+likely to get whacked over the head.
+
+One catch here is that we actually want the HTTP cache semantics to be
+available on `CachePolicy` too. At least, at time of writing, we do. To
+achieve this `CachePolicy::to_archived` is provided, which will serialize the
+`CachePolicy` to its archived representation in bytes, and then turn that
+into an `OwnedArchive<CachePolicy>` which derefs to `ArchivedCachePolicy`.
+This is a little extra cost, but the idea is that a `CachePolicy` (not an
+`ArchivedCachePolicy`) should only be used in the slower path (i.e., when you
+actually need to make an HTTP request).
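+
+To make the flow above concrete, here is a rough sketch of how a caller might
+wire these pieces together. The `load_cached` and `store` helpers below are
+illustrative stand-ins for whatever persistence the caller uses (they are not
+APIs provided by this module), and error handling is elided:
+
+```rust,ignore
+use crate::httpcache::{AfterResponse, BeforeRequest, CachePolicy, CachePolicyBuilder};
+
+async fn cached_get(
+    client: &reqwest::Client,
+    mut request: reqwest::Request,
+) -> anyhow::Result<Vec<u8>> {
+    // `load_cached` is assumed to return a previously stored
+    // (OwnedArchive<CachePolicy>, body) pair for this URL, if one exists.
+    if let Some((policy, body)) = load_cached(request.url()) {
+        match policy.before_request(&mut request) {
+            // The cached response is still fresh: no network traffic at all.
+            BeforeRequest::Fresh => return Ok(body),
+            // The cached response is stale: send the (now revalidation)
+            // request and let `after_response` decide what to do.
+            BeforeRequest::Stale(builder) => {
+                let response = client.execute(request).await?;
+                return match policy.after_response(builder, &response) {
+                    AfterResponse::NotModified(new_policy) => {
+                        // Keep the old body, but persist the updated policy.
+                        store(new_policy, &body);
+                        Ok(body)
+                    }
+                    AfterResponse::Modified(new_policy) => {
+                        let body = response.bytes().await?.to_vec();
+                        store(new_policy, &body);
+                        Ok(body)
+                    }
+                };
+            }
+            // The policy doesn't match this request; fall through and treat
+            // it as an uncached request.
+            BeforeRequest::NoMatch => {}
+        }
+    }
+    let builder = CachePolicyBuilder::new(&request);
+    let response = client.execute(request).await?;
+    let policy = builder.build(&response);
+    let body = response.bytes().await?.to_vec();
+    if policy.to_archived().is_storable() {
+        store(policy, &body);
+    }
+    Ok(body)
+}
+```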
+
+[`rkyv::vec::ArchivedVec`]: https://docs.rs/rkyv/0.7.43/rkyv/vec/struct.ArchivedVec.html
+[`rkyv::string::ArchivedString`]: https://docs.rs/rkyv/0.7.43/rkyv/string/struct.ArchivedString.html
+
+# Additional reading
+
+* Short introduction to `Cache-Control`: https://csswizardry.com/2019/03/cache-control-for-civilians/
+* Caching best practices: https://jakearchibald.com/2016/caching-best-practices/
+* Overview of HTTP caching: https://developer.mozilla.org/en-US/docs/Web/HTTP/Caching
+* MDN docs for `Cache-Control`: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Cache-Control
+* The 1997 RFC for HTTP 1.1: https://www.rfc-editor.org/rfc/rfc2068#section-13
+* The 1999 update to HTTP 1.1: https://www.rfc-editor.org/rfc/rfc2616.html#section-13
+* The "stale content" cache-control extension: https://httpwg.org/specs/rfc5861.html
+* HTTP 1.1 caching (superseded by RFC 9111): https://httpwg.org/specs/rfc7234.html
+* The "immutable" cache-control extension: https://httpwg.org/specs/rfc8246.html
+* HTTP semantics (If-None-Match, etc.): https://www.rfc-editor.org/rfc/rfc9110#section-8.8.3
+* HTTP caching (obsoletes RFC 7234): https://www.rfc-editor.org/rfc/rfc9111.html
+*/
+
+use std::time::{Duration, SystemTime};
+
+use {http::header::HeaderValue, rkyv::bytecheck};
+
+use crate::rkyvutil::OwnedArchive;
+
+use self::control::CacheControl;
+
+mod control;
+
+/// Knobs to configure Puffin's cache behavior.
+///
+/// At time of writing, we don't expose any way of modifying these since I
+/// suspect we won't ever need to. We split them out into their own type so
+/// that they can be shared between `CachePolicyBuilder` and `CachePolicy`.
+#[derive(Clone, Debug, rkyv::Archive, rkyv::CheckBytes, rkyv::Deserialize, rkyv::Serialize)]
+// Since `CacheConfig` is so simple, we can use itself as the archived type.
+// But note that this will fall apart if even something like an Option is
+// added.
+#[archive(as = "Self")]
+#[repr(C)]
+struct CacheConfig {
+    shared: bool,
+    heuristic_percent: u8,
+}
+
+impl Default for CacheConfig {
+    fn default() -> CacheConfig {
+        CacheConfig {
+            // The caching that puffin does ought to be considered
+            // private.
+            shared: false,
+            // This is only used to heuristically guess at a freshness lifetime
+            // when other indicators (such as `max-age` and `Expires`) are
+            // absent.
+            heuristic_percent: 10,
+        }
+    }
+}
+
+/// A builder for constructing a `CachePolicy`.
+///
+/// A builder can be used directly when spawning fresh HTTP requests
+/// without a cached response. A builder is also constructed for you via
+/// [`CachePolicy::before_request`] when a cached response exists but is stale.
+///
+/// The main idea of a builder is that it manages the flow of data needed to
+/// construct a `CachePolicy`. That is, you start with an HTTP request, then
+/// you get a response and finally a new `CachePolicy`.
+#[derive(Debug)]
+pub struct CachePolicyBuilder {
+    /// The configuration controlling the behavior of the cache.
+    config: CacheConfig,
+    /// A subset of information from the HTTP request that we will store. This
+    /// is needed to make future decisions about cache behavior.
+    request: Request,
+    /// The full set of request headers. This copy is necessary because the
+    /// headers are needed in order to correctly capture the values necessary
+    /// to implement the `Vary` check, as per [RFC 9111 S4.1]. The upside is
+    /// that this is not actually persisted in a `CachePolicy`. We only need it
+    /// until we have the response.
+    ///
+    /// The precise reason why this copy is intrinsically needed is because
+    /// sending a request requires ownership of the request. Yet, we don't know
+    /// which header values we need to store in our cache until we get the
+    /// response back. Thus, these headers must be persisted until after the
+    /// point we've given up ownership of the request.
+    ///
+    /// [RFC 9111 S4.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.1
+    request_headers: http::HeaderMap,
+}
+
+impl CachePolicyBuilder {
+    /// Create a new builder of a cache policy, starting with the request.
+    pub fn new(request: &reqwest::Request) -> CachePolicyBuilder {
+        let config = CacheConfig::default();
+        let request_headers = request.headers().clone();
+        let request = Request::from(request);
+        CachePolicyBuilder {
+            config,
+            request,
+            request_headers,
+        }
+    }
+
+    /// Return a new policy given the response to the request that this builder
+    /// was created with.
+    pub fn build(self, response: &reqwest::Response) -> CachePolicy {
+        let vary = Vary::from_request_response_headers(&self.request_headers, response.headers());
+        CachePolicy {
+            config: self.config,
+            request: self.request,
+            response: Response::from(response),
+            vary,
+        }
+    }
+}
+
+/// A value encapsulating the data needed to implement HTTP caching behavior
+/// for Puffin.
+///
+/// A cache policy is meant to be stored and persisted with the data being
+/// cached. It is specifically meant to capture the smallest amount of
+/// information needed to determine whether a cached response is stale or not,
+/// and the information required to issue a re-validation request.
+///
+/// This does not provide a complete set of HTTP cache semantics. Notably
+/// absent from this (among other things that Puffin probably doesn't care
+/// about) are proxy cache semantics.
+#[derive(Debug, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)]
+#[archive(check_bytes)]
+#[archive_attr(derive(Debug))]
+pub struct CachePolicy {
+    /// The configuration controlling the behavior of the cache.
+    config: CacheConfig,
+    /// A subset of information from the HTTP request that we will store. This
+    /// is needed to make future decisions about cache behavior.
+    request: Request,
+    /// A subset of information from the HTTP response that we will store. This
+    /// is needed to make future decisions about cache behavior.
+    response: Response,
+    /// This contains the set of vary header names (from the cached response)
+    /// and the corresponding values (from the original request) used to verify
+    /// whether a new request can utilize a cached response or not. This is
+    /// placed outside of `request` and `response` because it contains bits
+    /// from both!
+    vary: Vary,
+}
+
+impl CachePolicy {
+    /// Convert this to an owned archive value.
+    ///
+    /// It's necessary to call this in order to make decisions with this cache
+    /// policy. Namely, all of the cache semantics logic is implemented on the
+    /// archived types.
+    ///
+    /// This does incur an extra cost, but it should only be needed when you
+    /// don't have an `ArchivedCachePolicy`. And that should only occur when
+    /// you're actually performing an HTTP request. In that case, the extra
+    /// cost that is done here to convert a `CachePolicy` to its archived form
+    /// should be marginal.
+    pub fn to_archived(&self) -> OwnedArchive<CachePolicy> {
+        // There's no way (other than OOM) for serializing this type to fail.
+ OwnedArchive::from_unarchived(self).expect("all possible values can be archived") + } +} + +impl ArchivedCachePolicy { + /// Determines what caching behavior is correct given an existing + /// `CachePolicy` and a new HTTP request for the resource managed by this + /// cache policy. This is done as per [RFC 9111 S4]. + /// + /// Calling this method conceptually corresponds to asking the following + /// question: "I have a cached response for an incoming HTTP request. May I + /// return that cached response, or do I need to go back to the progenitor + /// of that response to determine whether it's still the latest thing?" + /// + /// This returns one of three possible behaviors: + /// + /// 1. The cached response is still fresh, and the caller may return + /// the cached response without issuing an HTTP requests. + /// 2. The cached response is stale. The caller should send a re-validation + /// request and then call `CachePolicy::after_response` to determine whether + /// the cached response is actually fresh, or if it's stale and needs to + /// be updated. + /// 3. The given request does not match the cache policy identification. + /// Generally speaking, this usually implies a bug with the cache in that + /// it loaded a cache policy that does not match the request. + /// + /// In the case of (2), the given request is modified in place such that + /// it is suitable as a revalidation request. + /// + /// [RFC 9111 S4]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4 + pub fn before_request(&self, request: &mut reqwest::Request) -> BeforeRequest { + let now = SystemTime::now(); + // If the response was never storable, then we just bail out + // completely. + if !self.is_storable() { + tracing::trace!( + "request {} does not match cache request {} because it isn't storable", + request.url(), + self.request.uri, + ); + return BeforeRequest::NoMatch; + } + // "When presented with a request, a cache MUST NOT reuse a stored + // response unless..." + // + // "the presented target URI and that of the stored response match, + // and..." + if self.request.uri != request.url().as_str() { + tracing::trace!( + "request {} does not match cache URL of {}", + request.url(), + self.request.uri, + ); + return BeforeRequest::NoMatch; + } + // "the request method associated with the stored response allows it to + // be used for the presented request, and..." + if request.method() != http::Method::GET && request.method() != http::Method::HEAD { + tracing::trace!( + "method {:?} for request {} is not supported by this cache", + request.method(), + request.url(), + ); + return BeforeRequest::NoMatch; + } + // "request header fields nominated by the stored response (if any) + // match those presented, and..." + // + // We don't support the `Vary` header, so if it was set, we + // conservatively require revalidation. + if !self.vary.matches(request.headers()) { + tracing::trace!( + "request {} does not match cached request because of the 'Vary' header", + request.url(), + ); + self.set_revalidation_headers(request); + return BeforeRequest::Stale(self.new_cache_policy_builder(request)); + } + // "the stored response does not contain the no-cache directive, unless + // it is successfully validated, and..." + if self.response.headers.cc.no_cache { + self.set_revalidation_headers(request); + return BeforeRequest::Stale(self.new_cache_policy_builder(request)); + } + // "the stored response is one of the following: ..." + // + // "fresh, or..." + // "allowed to be served stale, or..." 
+ if self.is_fresh(now, request) { + return BeforeRequest::Fresh; + } + // "successfully validated." + // + // In this case, callers will need to send a revalidation request. + self.set_revalidation_headers(request); + BeforeRequest::Stale(self.new_cache_policy_builder(request)) + } + + /// This implements the logic for handling the response to a request that + /// may be a revalidation request, as per [RFC 9111 S4.3.3] and [RFC 9111 + /// S4.3.4]. That is, the cache policy builder given here should be the one + /// returned by `CachePolicy::before_request` with the response received + /// from the origin server for the possibly-revalidating request. + /// + /// Even if the request is new (in that there is no response cached + /// for it), callers may use this routine. But generally speaking, + /// callers are only supposed to use this routine after getting a + /// [`BeforeRequest::Stale`]. + /// + /// The return value indicates whether the cached response is still fresh + /// (that is, `AfterResponse::NotModified`) or if it has changed (that is, + /// `AfterResponse::Modified`). In the latter case, the cached response has + /// been invalidated and the caller should cache the new response. In the + /// former case, the cached response is still considered fresh. + /// + /// In either case, callers should update their cache with the new policy. + /// + /// [RFC 9111 S4.3.3]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.3.3 + /// [RFC 9111 S4.3.4]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.3.4 + pub fn after_response( + &self, + cache_policy_builder: CachePolicyBuilder, + response: &reqwest::Response, + ) -> AfterResponse { + let mut new_policy = cache_policy_builder.build(response); + if self.is_modified(&new_policy) { + AfterResponse::Modified(new_policy) + } else { + new_policy.response.status = self.response.status; + AfterResponse::NotModified(new_policy) + } + } + + fn is_modified(&self, new_policy: &CachePolicy) -> bool { + // From [RFC 9111 S4.3.3], + // + // "A 304 (Not Modified) response status code indicates that the stored + // response can be updated and reused" + // + // So if we don't get a 304, then we know our cached response is seen + // as stale by the origin server. + // + // [RFC 9111 S4.3.3]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.3.3 + if new_policy.response.status != 304 { + return true; + } + // As per [RFC 9111 S4.3.4], we need to confirm that our validators match. Here, + // we check `ETag`. + // + // [RFC 9111 S4.3.4]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.3.4 + if let Some(old_etag) = self.response.headers.etag.as_ref() { + if let Some(new_etag) = new_policy.response.headers.etag.as_ref() { + // We don't support weak validators, so only match if they're + // both strong. + if !old_etag.weak && !new_etag.weak && old_etag.value == new_etag.value { + return false; + } + } + } + // As per [RFC 9111 S4.3.4], we need to confirm that our validators match. Here, + // we check `Last-Modified`. + // + // [RFC 9111 S4.3.4]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.3.4 + if let Some(old_last_modified) = self.response.headers.last_modified_unix_timestamp.as_ref() + { + if let Some(new_last_modified) = new_policy + .response + .headers + .last_modified_unix_timestamp + .as_ref() + { + if old_last_modified == new_last_modified { + return false; + } + } + } + // As per [RFC 9111 S4.3.4], if we have no validators anywhere, then + // we can just rely on the HTTP 304 status code and reuse the cached + // response. 
+ // + // [RFC 9111 S4.3.4]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.3.4 + if self.response.headers.etag.is_none() + && new_policy.response.headers.etag.is_none() + && self.response.headers.last_modified_unix_timestamp.is_none() + && new_policy + .response + .headers + .last_modified_unix_timestamp + .is_none() + { + return false; + } + true + } + + /// Sets the relevant headers on the given request so that it can be used + /// as a revalidation request. As per [RFC 9111 S4.3.1], this permits the + /// origin server to check if the content is different from our cached + /// response. If it isn't, then the origin server can return an HTTP 304 + /// NOT MODIFIED status, which avoids the need to re-transmit the response + /// body. That is, it indicates that our cached response is still fresh. + /// + /// This will always use a strong etag validator if it's present on the + /// cached response. If the given request already has an etag validator + /// on it, this routine will add to it and not replace it. + /// + /// In contrast, if the request already has the `If-Modified-Since` header + /// set, then this will not change or replace it. If it's not set, then one + /// is added if the cached response had a valid `Last-Modified` header. + /// + /// [RFC 9111 S4.3.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.3.1 + fn set_revalidation_headers(&self, request: &mut reqwest::Request) { + // As per [RFC 9110 13.1.2] and [RFC 9111 S4.3.1], if our stored + // response has an etag, we should send it back via the `If-None-Match` + // header. The idea is that the server should only "do" the request if + // none of the tags match. If there is a match, then the server can + // return HTTP 304 indicating that our stored response is still fresh. + // + // [RFC 9110 S13.1.2]: https://www.rfc-editor.org/rfc/rfc9110#section-13.1.2 + // [RFC 9111 S4.3.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.3.1 + if let Some(etag) = self.response.headers.etag.as_ref() { + // We don't support weak validation principally because we want to + // be notified if there was a change in the content. Namely, from + // RFC 9110 S13.1.2: "... weak entity tags can be used for cache + // validation even if there have been changes to the representation + // data." + if !etag.weak { + if let Ok(header) = HeaderValue::from_bytes(&etag.value) { + request.headers_mut().append("if-none-match", header); + } + } + } + // We also set `If-Modified-Since` as per [RFC 9110 S13.1.3] and [RFC + // 9111 S4.3.1]. Generally, `If-None-Match` will override this, but we + // set it in case `If-None-Match` is not supported. + // + // [RFC 9110 S13.1.3]: https://www.rfc-editor.org/rfc/rfc9110#section-13.1.3 + // [RFC 9111 S4.3.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.3.1 + if !request.headers().contains_key("if-modified-since") { + if let Some(&last_modified_unix_timestamp) = + self.response.headers.last_modified_unix_timestamp.as_ref() + { + if let Some(last_modified) = unix_timestamp_to_header(last_modified_unix_timestamp) + { + request + .headers_mut() + .insert("if-modified-since", last_modified); + } + } + } + } + + /// Returns true if and only if the response is storable as per + /// [RFC 9111 S3]. + /// + /// [RFC 9111 S3]: https://www.rfc-editor.org/rfc/rfc9111.html#section-3 + pub fn is_storable(&self) -> bool { + // In the absence of other signals, we are limited to caching responses + // with a code that is heuristically cacheable as per [RFC 9110 S15.1]. 
+ // + // [RFC 9110 S15.1]: https://www.rfc-editor.org/rfc/rfc9110#section-15.1 + const HEURISTICALLY_CACHEABLE_STATUS_CODES: &[u16] = + &[200, 203, 204, 206, 300, 301, 308, 404, 405, 410, 414, 501]; + + // N.B. This routine could be "simpler", but we bias toward + // following the flow of logic as closely as possible as written + // in RFC 9111 S3. + + // "the request method is understood by the cache" + // + // We just don't bother with anything that isn't GET. + if !matches!( + self.request.method, + ArchivedMethod::Get | ArchivedMethod::Head + ) { + tracing::trace!( + "cached request {} is not storable because of its method {:?}", + self.request.uri, + self.request.method + ); + return false; + } + // "the response status code is final" + // + // ... and we'll put more restrictions on status code + // below, but we can bail out early here. + if !self.response.has_final_status() { + tracing::trace!( + "cached request {} is not storable because its response has \ + non-final status code {:?}", + self.request.uri, + self.response.status, + ); + return false; + } + // "if the response status code is 206 or 304, or the must-understand + // cache directive (see Section 5.2.2.3) is present: the cache + // understands the response status code" + // + // We don't currently support `must-understand`. We also don't support + // partial content (206). And 304 not modified shouldn't be cached + // itself. + if self.response.status == 206 || self.response.status == 304 { + tracing::trace!( + "cached request {} is not storable because its response has \ + unsupported status code {:?}", + self.request.uri, + self.response.status, + ); + return false; + } + // "The no-store request directive indicates that a cache MUST NOT + // store any part of either this request or any response to it." + // + // (This is from RFC 9111 S5.2.1.5, and doesn't seem to be mentioned in + // S3.) + if self.request.headers.cc.no_store { + tracing::trace!( + "cached request {} is not storable because its request has \ + a 'no-store' cache-control directive", + self.request.uri, + ); + return false; + } + // "the no-store cache directive is not present in the response" + if self.response.headers.cc.no_store { + tracing::trace!( + "cached request {} is not storable because its response has \ + a 'no-store' cache-control directive", + self.request.uri, + ); + return false; + } + // "if the cache is shared ..." + if self.config.shared { + // "if the cache is shared: the private response directive is either + // not present or allows a shared cache to store a modified response" + // + // We don't support more granular "private" directives (which allow + // caching all of a private HTTP response in a shared cache only after + // removing some subset of the response's headers that are deemed + // private). 
+            if self.response.headers.cc.private {
+                tracing::trace!(
+                    "cached request {} is not storable because this is a shared \
+                     cache and its response has a 'private' cache-control directive",
+                    self.request.uri,
+                );
+                return false;
+            }
+            // "if the cache is shared: the Authorization header field is not
+            // present in the request or a response directive is present that
+            // explicitly allows shared caching"
+            if self.request.headers.authorization && !self.allows_authorization_storage() {
+                tracing::trace!(
+                    "cached request {} is not storable because this is a shared \
+                     cache, the request has an 'Authorization' header set and \
+                     the response has not indicated that caching requests with an \
+                     'Authorization' header is allowed",
+                    self.request.uri,
+                );
+                return false;
+            }
+        }
+
+        // "the response contains at least one of the following ..."
+        //
+        // "a public response directive"
+        if self.response.headers.cc.public {
+            tracing::trace!(
+                "cached request {} is storable because its response has \
+                 a 'public' cache-control directive",
+                self.request.uri,
+            );
+            return true;
+        }
+        // "a private response directive, if the cache is not shared"
+        if !self.config.shared && self.response.headers.cc.private {
+            tracing::trace!(
+                "cached request {} is storable because this is not a shared cache \
+                 and its response has a 'private' cache-control directive",
+                self.request.uri,
+            );
+            return true;
+        }
+        // "an Expires header field"
+        if self.response.headers.expires_unix_timestamp.is_some() {
+            tracing::trace!(
+                "cached request {} is storable because its response has an \
+                 'Expires' header set",
+                self.request.uri,
+            );
+            return true;
+        }
+        // "a max-age response directive"
+        if self.response.headers.cc.max_age_seconds.is_some() {
+            tracing::trace!(
+                "cached request {} is storable because its response has a \
+                 'max-age' cache-control directive",
+                self.request.uri,
+            );
+            return true;
+        }
+        // "if the cache is shared: an s-maxage response directive"
+        if self.config.shared && self.response.headers.cc.s_maxage_seconds.is_some() {
+            tracing::trace!(
+                "cached request {} is storable because this is a shared cache \
+                 and its response has an 's-maxage' cache-control directive",
+                self.request.uri,
+            );
+            return true;
+        }
+        // "a cache extension that allows it to be cached"
+        // ... we don't support any extensions.
+        //
+        // "a status code that is defined as heuristically cacheable"
+        if HEURISTICALLY_CACHEABLE_STATUS_CODES.contains(&self.response.status) {
+            tracing::trace!(
+                "cached request {} is storable because its response has a \
+                 heuristically cacheable status code {:?}",
+                self.request.uri,
+                self.response.status,
+            );
+            return true;
+        }
+        tracing::trace!(
+            "cached response {} is not storable because it does not meet any \
+             of the necessary criteria (e.g., it doesn't have an 'Expires' \
+             header set or a 'max-age' cache-control directive)",
+            self.request.uri,
+        );
+        false
+    }
+
+    /// Returns true when a response is storable even if it has an
+    /// `Authorization` header, as per [RFC 9111 S3.5].
+    ///
+    /// [RFC 9111 S3.5]: https://www.rfc-editor.org/rfc/rfc9111.html#section-3.5
+    fn allows_authorization_storage(&self) -> bool {
+        self.response.headers.cc.must_revalidate
+            || self.response.headers.cc.public
+            || self.response.headers.cc.s_maxage_seconds.is_some()
+    }
+
+    /// Returns true if the response is considered fresh as per [RFC 9111
+    /// S4.2]. If the response is not fresh, then it is considered stale and
+    /// ought to be revalidated with the origin server.
+    ///
+    /// [RFC 9111 S4.2]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.2
+    fn is_fresh(&self, now: SystemTime, request: &reqwest::Request) -> bool {
+        let freshness_lifetime = self.freshness_lifetime().as_secs();
+        let age = self.age(now).as_secs();
+
+        // Per RFC 8246, the `immutable` directive means that a reload from an
+        // end user should not result in a revalidation request. Indeed, the
+        // `immutable` directive seems to imply that clients should never talk
+        // to the origin server until the cached response is stale with respect
+        // to its freshness lifetime (as set by the server).
+        //
+        // A *force* reload from an end user should override this, but we
+        // currently have no path for that in this implementation. Instead, we
+        // just interpret `immutable` as meaning that any directives on the
+        // new request that would otherwise result in sending a revalidation
+        // request are ignored.
+        //
+        // [RFC 8246]: https://httpwg.org/specs/rfc8246.html
+        if !self.response.headers.cc.immutable {
+            let reqcc = request
+                .headers()
+                .get_all("cache-control")
+                .iter()
+                .collect::<CacheControl>();
+
+            // As per [RFC 9111 S5.2.1.4], if the request has `no-cache`, then we should
+            // respect that.
+            //
+            // [RFC 9111 S5.2.1.4]: https://www.rfc-editor.org/rfc/rfc9111.html#section-5.2.1.4
+            if reqcc.no_cache {
+                tracing::trace!(
+                    "request {} does not have a fresh cache because \
+                     it has a 'no-cache' cache-control directive",
+                    request.url(),
+                );
+                return false;
+            }
+
+            // If the request has a max-age directive, then we should respect that
+            // as per [RFC 9111 S5.2.1.1].
+            //
+            // [RFC 9111 S5.2.1.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-5.2.1.1
+            if let Some(&max_age) = reqcc.max_age_seconds.as_ref() {
+                if age > max_age {
+                    tracing::trace!(
+                        "request {} does not have a fresh cache because \
+                         the cached response's age is {} seconds and the max age \
+                         allowed by the request is {} seconds",
+                        request.url(),
+                        age,
+                        max_age,
+                    );
+                    return false;
+                }
+            }
+
+            // If the request has a min-fresh directive, then we only consider a
+            // cached response fresh if the remaining time it has to live exceeds
+            // the threshold provided, as per [RFC 9111 S5.2.1.3].
+            //
+            // [RFC 9111 S5.2.1.3]: https://www.rfc-editor.org/rfc/rfc9111.html#section-5.2.1.3
+            if let Some(&min_fresh) = reqcc.min_fresh_seconds.as_ref() {
+                let time_to_live = freshness_lifetime.saturating_sub(unix_timestamp(now));
+                if time_to_live < min_fresh {
+                    tracing::trace!(
+                        "request {} does not have a fresh cache because \
+                         the request set a 'min-fresh' cache-control directive, \
+                         and its time-to-live is {} seconds but it needs to be \
+                         at least {} seconds",
+                        request.url(),
+                        time_to_live,
+                        min_fresh,
+                    );
+                    // Note that S5.2.1.3 does not say that max-stale overrides
+                    // this, so we ignore it here.
+                    return false;
+                }
+            }
+        }
+        if age > freshness_lifetime {
+            let allows_stale = self.allows_stale(now);
+            if !allows_stale {
+                tracing::trace!(
+                    "request {} does not have a fresh cache because \
+                     its age is {} seconds, it is greater than the freshness \
+                     lifetime of {} seconds and stale cached responses are not \
+                     allowed",
+                    request.url(),
+                    age,
+                    freshness_lifetime,
+                );
+                return false;
+            }
+        }
+        true
+    }
+
+    /// Returns true if we're allowed to serve a stale response, as per [RFC
+    /// 9111 S4.2.4].
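+    ///
+    /// For example (numbers illustrative): a cached response that is 70
+    /// seconds old with a freshness lifetime of 60 seconds is 10 seconds
+    /// stale. If the original cached request carried a
+    /// `Cache-Control: max-stale=30` directive, the stale response may still
+    /// be served; without `max-stale` (or with `must-revalidate` on the
+    /// response), it may not.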
+ /// + /// [RFC 9111 S4.2.4]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.2.4 + fn allows_stale(&self, now: SystemTime) -> bool { + // As per [RFC 9111 S5.2.2.2], if `must-revalidate` is present, then + // caches cannot reuse a stale response without talking to the server + // first. Note that RFC 9111 doesn't seem to say anything about the + // interaction between must-revalidate and max-stale, so we assume that + // must-revalidate takes precedent. + // + // [RFC 9111 S5.2.2.2]: https://www.rfc-editor.org/rfc/rfc9111.html#section-5.2.2.2 + if self.response.headers.cc.must_revalidate { + tracing::trace!( + "cached request {} has a cached response that does not \ + permit staleness because the response has a 'must-revalidate' \ + cache-control directive set", + self.request.uri, + ); + return false; + } + if let Some(&max_stale) = self.request.headers.cc.max_stale_seconds.as_ref() { + // As per [RFC 9111 S5.2.1.2], if the client has max-stale set, + // then stale responses are allowed, but only if they are stale + // within a given threshold. + // + // [RFC 9111 S5.2.1.2]: https://www.rfc-editor.org/rfc/rfc9111.html#section-5.2.1.2 + let stale_amount = self + .age(now) + .as_secs() + .saturating_sub(self.freshness_lifetime().as_secs()); + if stale_amount <= max_stale { + tracing::trace!( + "cached request {} has a cached response that allows staleness \ + in this case because the stale amount is {} seconds and the \ + 'max-stale' cache-control directive set by the cached request \ + is {} seconds", + self.request.uri, + stale_amount, + max_stale, + ); + return true; + } + } + // As per [RFC 9111 S4.2.4], we shouldn't use stale responses unless + // we're explicitly allowed to (e.g., via `max-stale` above): + // + // "A cache MUST NOT generate a stale response unless it is + // disconnected or doing so is explicitly permitted by the client or + // origin server..." + // + // [RFC 9111 S4.2.4]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.2.4 + tracing::trace!( + "cached request {} has a cached response that does not allow staleness", + self.request.uri, + ); + false + } + + /// Returns the age of the HTTP response as per [RFC 9111 S4.2.3]. + /// + /// The age of a response, essentially, refers to how long it has been + /// since the response was created by the origin server. The age is used + /// to compare with the freshness lifetime of the response to determine + /// whether the response is fresh or stale. + /// + /// [RFC 9111 S4.2.3]: https://www.rfc-editor.org/rfc/rfc9111.html#name-calculating-age + fn age(&self, now: SystemTime) -> Duration { + // RFC 9111 S4.2.3 + let apparent_age = self + .response + .unix_timestamp + .saturating_sub(self.response.header_date()); + let response_delay = self + .response + .unix_timestamp + .saturating_sub(self.request.unix_timestamp); + let corrected_age_value = self.response.header_age().saturating_add(response_delay); + let corrected_initial_age = apparent_age.max(corrected_age_value); + let resident_age = unix_timestamp(now).saturating_sub(self.response.unix_timestamp); + let current_age = corrected_initial_age + resident_age; + Duration::from_secs(current_age) + } + + /// Returns how long a response should be considered "fresh" as per + /// [RFC 9111 S4.2.1]. When this returns false, the response should be + /// considered stale and the client should revalidate with the server. + /// + /// If there are no indicators of a response's freshness lifetime, then + /// this returns `0`. 
That is, the response will be considered stale in all
+    /// cases.
+    ///
+    /// [RFC 9111 S4.2.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.2.1
+    fn freshness_lifetime(&self) -> Duration {
+        if self.config.shared {
+            if let Some(&s_maxage) = self.response.headers.cc.s_maxage_seconds.as_ref() {
+                return Duration::from_secs(s_maxage);
+            }
+        }
+        if let Some(&max_age) = self.response.headers.cc.max_age_seconds.as_ref() {
+            return Duration::from_secs(max_age);
+        }
+        if let Some(&expires) = self.response.headers.expires_unix_timestamp.as_ref() {
+            return Duration::from_secs(expires.saturating_sub(self.response.header_date()));
+        }
+        if let Some(&last_modified) = self.response.headers.last_modified_unix_timestamp.as_ref() {
+            let interval = self.response.header_date().saturating_sub(last_modified);
+            let percent = u64::from(self.config.heuristic_percent);
+            return Duration::from_secs(interval.saturating_mul(percent).saturating_div(100));
+        }
+        // Without any indicators as to the freshness lifetime, we act
+        // conservatively and use a value that will always result in a response
+        // being treated as stale.
+        Duration::ZERO
+    }
+
+    fn new_cache_policy_builder(&self, request: &reqwest::Request) -> CachePolicyBuilder {
+        let request_headers = request.headers().clone();
+        CachePolicyBuilder {
+            config: self.config.clone(),
+            request: Request::from(request),
+            request_headers,
+        }
+    }
+}
+
+/// The result of calling [`CachePolicy::before_request`].
+///
+/// This dictates what the caller should do next by indicating whether the
+/// cached response is stale or not.
+#[derive(Debug)]
+pub enum BeforeRequest {
+    /// The cached response is still fresh, and the caller may return the
+    /// cached response without issuing an HTTP request.
+    Fresh,
+    /// The cached response is stale. The caller should send a re-validation
+    /// request and then call `CachePolicy::after_response` to determine
+    /// whether the cached response is actually fresh, or if it's stale and
+    /// needs to be updated.
+    Stale(CachePolicyBuilder),
+    /// The given request does not match the cache policy identification.
+    /// Generally speaking, this usually implies a bug with the cache in
+    /// that it loaded a cache policy that does not match the request.
+    NoMatch,
+}
+
+/// The result of calling [`CachePolicy::after_response`].
+///
+/// This is meant to report whether a revalidation request was successful or
+/// not. If it was, then an `AfterResponse::NotModified` is returned. Otherwise,
+/// the server determined that the cached response was truly stale and in need
+/// of updating.
+#[derive(Debug)]
+pub enum AfterResponse {
+    /// The cached response is still fresh.
+    NotModified(CachePolicy),
+    /// The cached response has been invalidated and needs to be updated with
+    /// the new data in the response to the revalidation request.
+ Modified(CachePolicy), +} + +#[derive(Debug, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)] +#[archive(check_bytes)] +#[archive_attr(derive(Debug))] +struct Request { + uri: String, + method: Method, + headers: RequestHeaders, + unix_timestamp: u64, +} + +impl<'a> From<&'a reqwest::Request> for Request { + fn from(from: &'a reqwest::Request) -> Request { + Request { + uri: from.url().to_string(), + method: Method::from(from.method()), + headers: RequestHeaders::from(from.headers()), + unix_timestamp: unix_timestamp(SystemTime::now()), + } + } +} + +#[derive(Debug, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)] +#[archive(check_bytes)] +#[archive_attr(derive(Debug))] +struct RequestHeaders { + /// The cache control directives from the `Cache-Control` header. + cc: CacheControl, + /// This is set to `true` only when an `Authorization` header is present. + /// We don't need to record the value. + authorization: bool, +} + +impl<'a> From<&'a http::HeaderMap> for RequestHeaders { + fn from(from: &'a http::HeaderMap) -> RequestHeaders { + RequestHeaders { + cc: from.get_all("cache-control").iter().collect(), + authorization: from.contains_key("authorization"), + } + } +} + +/// The HTTP method used on a request. +/// +/// We don't both representing methods of requests whose responses we won't +/// cache. Instead, we treat them as "unrecognized" and consider the responses +/// not-storable. +#[derive(Debug, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)] +#[archive(check_bytes)] +#[archive_attr(derive(Debug))] +#[repr(u8)] +enum Method { + Get, + Head, + Unrecognized, +} + +impl<'a> From<&'a http::Method> for Method { + fn from(from: &'a http::Method) -> Method { + if from == http::Method::GET { + Method::Get + } else if from == http::Method::HEAD { + Method::Head + } else { + Method::Unrecognized + } + } +} + +#[derive(Debug, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)] +#[archive(check_bytes)] +#[archive_attr(derive(Debug))] +struct Response { + status: u16, + headers: ResponseHeaders, + unix_timestamp: u64, +} + +impl ArchivedResponse { + /// Returns the "age" header value on this response, with a fallback of `0` + /// if the header doesn't exist or is invalid, as per [RFC 9111 S4.2.3]. + /// + /// Note that this does not reflect the true "age" of a response. That + /// is computed via `ArchivedCachePolicy::age` as it may need additional + /// information (such as the request time). + /// + /// [RFC 9111 S4.2.3]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.2.3 + fn header_age(&self) -> u64 { + self.headers.age_seconds.unwrap_or(0) + } + + /// Returns the "date" header value on this response, with a fallback to + /// the time the response was received as per [RFC 9110 S6.6.1]. + /// + /// [RFC 9110 S6.6.1]: https://www.rfc-editor.org/rfc/rfc9110#section-6.6.1 + fn header_date(&self) -> u64 { + self.headers + .date_unix_timestamp + .unwrap_or(self.unix_timestamp) + } + + /// Returns true when this response has a status code that is considered + /// "final" as per [RFC 9110 S15]. 
+    ///
+    /// [RFC 9110 S15]: https://www.rfc-editor.org/rfc/rfc9110#section-15
+    fn has_final_status(&self) -> bool {
+        self.status >= 200
+    }
+}
+
+impl<'a> From<&'a reqwest::Response> for Response {
+    fn from(from: &'a reqwest::Response) -> Response {
+        Response {
+            status: from.status().as_u16(),
+            headers: ResponseHeaders::from(from.headers()),
+            unix_timestamp: unix_timestamp(SystemTime::now()),
+        }
+    }
+}
+
+#[derive(Debug, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)]
+#[archive(check_bytes)]
+#[archive_attr(derive(Debug))]
+struct ResponseHeaders {
+    /// The directives from the `Cache-Control` header.
+    cc: CacheControl,
+    /// The value of the `Age` header corresponding to `age_value` as defined
+    /// in [RFC 9111 S4.2.3]. If the `Age` header is not present, it should be
+    /// interpreted as `0`.
+    ///
+    /// [RFC 9111 S4.2.3]: https://www.rfc-editor.org/rfc/rfc9111.html#name-calculating-age
+    age_seconds: Option<u64>,
+    /// This is `date_value` from [RFC 9111 S4.2.3], which says it corresponds
+    /// to the `Date` header on a response as defined in [RFC 7231 S7.1.1.2].
+    /// In RFC 7231, if the `Date` header is not present, then the recipient
+    /// should treat its value as equivalent to the time the response was
+    /// received. In this case, that would be `Response::unix_timestamp`.
+    ///
+    /// [RFC 9111 S4.2.3]: https://www.rfc-editor.org/rfc/rfc9111.html#name-calculating-age
+    /// [RFC 7231 S7.1.1.2]: https://httpwg.org/specs/rfc7231.html#header.date
+    date_unix_timestamp: Option<u64>,
+    /// This is from the `Expires` header as per [RFC 9111 S5.3]. Note that this
+    /// is overridden by the presence of either the `max-age` or `s-maxage` cache
+    /// control directives.
+    ///
+    /// If an `Expires` header was present but did not contain a valid RFC 2822
+    /// datetime, then this is set to `Some(0)`. (That is, some time in the
+    /// past, which implies the response has already expired.)
+    ///
+    /// [RFC 9111 S5.3]: https://www.rfc-editor.org/rfc/rfc9111.html#section-5.3
+    expires_unix_timestamp: Option<u64>,
+    /// The date from the `Last-Modified` header as specified in [RFC 9110 S8.8.2]
+    /// in RFC 2822 format. It's used to compute a heuristic freshness lifetime for
+    /// the response when other indicators are missing as per [RFC 9111 S4.2.2].
+    ///
+    /// [RFC 9110 S8.8.2]: https://www.rfc-editor.org/rfc/rfc9110#section-8.8.2
+    /// [RFC 9111 S4.2.2]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.2.2
+    last_modified_unix_timestamp: Option<u64>,
+    /// The "entity tag" from the response as per [RFC 9110 S8.8.3], which is
+    /// used in revalidation requests.
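+    ///
+    /// For example, a response header of `ETag: "xyzzy"` is recorded as a
+    /// strong validator and will be echoed back in `If-None-Match` on a
+    /// revalidation request, while `ETag: W/"xyzzy"` is parsed as a weak
+    /// validator and is never sent back. (The tag value is illustrative.)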
+    ///
+    /// [RFC 9110 S8.8.3]: https://www.rfc-editor.org/rfc/rfc9110#section-8.8.3
+    etag: Option<ETag>,
+}
+
+impl<'a> From<&'a http::HeaderMap> for ResponseHeaders {
+    fn from(from: &'a http::HeaderMap) -> ResponseHeaders {
+        ResponseHeaders {
+            cc: from.get_all("cache-control").iter().collect(),
+            age_seconds: from
+                .get("age")
+                .and_then(|header| parse_seconds(header.as_bytes())),
+            date_unix_timestamp: from
+                .get("date")
+                .and_then(|header| header.to_str().ok())
+                .and_then(rfc2822_to_unix_timestamp),
+            expires_unix_timestamp: from
+                .get("expires")
+                .and_then(|header| header.to_str().ok())
+                .and_then(rfc2822_to_unix_timestamp),
+            last_modified_unix_timestamp: from
+                .get("last-modified")
+                .and_then(|header| header.to_str().ok())
+                .and_then(rfc2822_to_unix_timestamp),
+            etag: from
+                .get("etag")
+                .map(|header| ETag::parse(header.as_bytes())),
+        }
+    }
+}
+
+#[derive(Debug, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)]
+#[archive(check_bytes)]
+#[archive_attr(derive(Debug))]
+struct ETag {
+    /// The actual ETag validator value.
+    ///
+    /// This is received in the response, recorded as part of the cache policy
+    /// and then sent back in a re-validation request. This is the "best"
+    /// way for an HTTP server to return an HTTP 304 NOT MODIFIED status,
+    /// indicating that our cached response is still fresh.
+    value: Vec<u8>,
+    /// When `weak` is true, this etag is considered a "weak" validator. In
+    /// effect, it provides weaker semantics than a "strong" validator. As per
+    /// [RFC 9110 S8.8.1]:
+    ///
+    /// "In contrast, a "weak validator" is representation metadata that might
+    /// not change for every change to the representation data. This weakness
+    /// might be due to limitations in how the value is calculated (e.g.,
+    /// clock resolution), an inability to ensure uniqueness for all possible
+    /// representations of the resource, or a desire of the resource owner to
+    /// group representations by some self-determined set of equivalency rather
+    /// than unique sequences of data."
+    ///
+    /// We don't currently support weak validation.
+    ///
+    /// [RFC 9110 S8.8.1]: https://www.rfc-editor.org/rfc/rfc9110#section-8.8.1-6
+    weak: bool,
+}
+
+impl ETag {
+    /// Parses an ETag from a header value.
+    ///
+    /// We are a little permissive here and allow arbitrary bytes,
+    /// whereas [RFC 9110 S8.8.3] is a bit more restrictive.
+    ///
+    /// [RFC 9110 S8.8.3]: https://www.rfc-editor.org/rfc/rfc9110#section-8.8.3
+    fn parse(header_value: &[u8]) -> ETag {
+        let (value, weak) = if header_value.starts_with(b"W/") {
+            (&header_value[2..], true)
+        } else {
+            (header_value, false)
+        };
+        ETag {
+            value: value.to_vec(),
+            weak,
+        }
+    }
+}
+
+/// Represents the `Vary` header on a cached response, as per [RFC 9110
+/// S12.5.5] and [RFC 9111 S4.1].
+///
+/// This permits responses from the server to express things like, "only use
+/// an existing cached response if the request from the client has the same
+/// header values for the headers listed in `Vary` as in the original request."
+///
+/// [RFC 9110 S12.5.5]: https://www.rfc-editor.org/rfc/rfc9110#section-12.5.5
+/// [RFC 9111 S4.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.1
+#[derive(Debug, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)]
+#[archive(check_bytes)]
+#[archive_attr(derive(Debug))]
+struct Vary {
+    fields: Vec<VaryField>,
+}
+
+impl Vary {
+    /// Returns a `Vary` header value that will never match any request.
+ fn always_fails_to_match() -> Vary { + Vary { + fields: vec![VaryField { + name: "*".to_string(), + value: vec![], + }], + } + } + + fn from_request_response_headers( + request: &http::HeaderMap, + response: &http::HeaderMap, + ) -> Vary { + // Parses the `Vary` header as per [RFC 9110 S12.5.5]. + // + // [RFC 9110 S12.5.5]: https://www.rfc-editor.org/rfc/rfc9110#section-12.5.5 + let mut fields = vec![]; + for header in response.get_all("vary") { + let Ok(csv) = header.to_str() else { continue }; + for header_name in csv.split(',') { + let header_name = header_name.trim().to_ascii_lowercase(); + // When we see a `*`, that means a failed match is an + // inevitability, regardless of anything else. So just give up + // and return a `Vary` that will never match. + if header_name == "*" { + return Vary::always_fails_to_match(); + } + let value = request + .get(&header_name) + .map(|header| header.as_bytes().to_vec()) + .unwrap_or_default(); + fields.push(VaryField { + name: header_name, + value, + }); + } + } + Vary { fields } + } +} + +impl ArchivedVary { + /// Returns true only when the `Vary` header on a cached response satisfies + /// the request header values given, as per [RFC 9111 S4.1]. + /// + /// [RFC 9111 S4.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.1 + fn matches(&self, request_headers: &http::HeaderMap) -> bool { + for field in self.fields.iter() { + // A `*` anywhere means the match always fails. + if field.name == "*" { + return false; + } + let request_header_value = request_headers + .get(field.name.as_str()) + .map_or(&b""[..], |header| header.as_bytes()); + if field.value.as_slice() != request_header_value { + return false; + } + } + true + } +} + +/// A single field and value in a `Vary` header set by the response, +/// as per [RFC 9111 S4.1]. +/// +/// The `name` of the field comes from the `Vary` header in the response, +/// while the value of the field comes from the value of the header with the +/// same `name` in the original request. These field and value pairs are then +/// compared with new incoming requests. If there is a mismatch, then the +/// cached response cannot be used. 
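+///
+/// For example (header values illustrative): if a cached response was stored
+/// with `Vary: accept-encoding` and the original request sent
+/// `Accept-Encoding: gzip`, then the stored field is
+/// (`accept-encoding`, `gzip`). A new request may only reuse the cached
+/// response if it sends that same `Accept-Encoding` value; otherwise we fall
+/// back to revalidating with the origin server.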
+///
+/// [RFC 9111 S4.1]: https://www.rfc-editor.org/rfc/rfc9111.html#section-4.1
+#[derive(Debug, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)]
+#[archive(check_bytes)]
+#[archive_attr(derive(Debug))]
+struct VaryField {
+    name: String,
+    value: Vec<u8>,
+}
+
+fn unix_timestamp(time: SystemTime) -> u64 {
+    time.duration_since(SystemTime::UNIX_EPOCH)
+        .expect("UNIX_EPOCH is as early as it gets")
+        .as_secs()
+}
+
+fn rfc2822_to_unix_timestamp(s: &str) -> Option<u64> {
+    rfc2822_to_datetime(s).and_then(|dt| u64::try_from(dt.timestamp()).ok())
+}
+
+fn rfc2822_to_datetime(s: &str) -> Option<chrono::DateTime<chrono::Utc>> {
+    chrono::DateTime::parse_from_rfc2822(s)
+        .ok()
+        .map(|dt| dt.to_utc())
+}
+
+fn unix_timestamp_to_header(seconds: u64) -> Option<HeaderValue> {
+    unix_timestamp_to_rfc2822(seconds).and_then(|string| HeaderValue::from_str(&string).ok())
+}
+
+fn unix_timestamp_to_rfc2822(seconds: u64) -> Option<String> {
+    unix_timestamp_to_datetime(seconds).map(|dt| dt.to_rfc2822())
+}
+
+fn unix_timestamp_to_datetime(seconds: u64) -> Option<chrono::DateTime<chrono::Utc>> {
+    chrono::DateTime::from_timestamp(i64::try_from(seconds).ok()?, 0)
+}
+
+fn parse_seconds(value: &[u8]) -> Option<u64> {
+    if !value.iter().all(u8::is_ascii_digit) {
+        return None;
+    }
+    std::str::from_utf8(value).ok()?.parse().ok()
+}
diff --git a/crates/puffin-client/src/lib.rs b/crates/puffin-client/src/lib.rs
index ebf20d587..ecc4917d9 100644
--- a/crates/puffin-client/src/lib.rs
+++ b/crates/puffin-client/src/lib.rs
@@ -7,11 +7,11 @@ pub use registry_client::{
 };
 pub use rkyvutil::OwnedArchive;
 
-mod cache_headers;
 mod cached_client;
 mod error;
 mod flat_index;
 mod html;
+mod httpcache;
 mod registry_client;
 mod remote_metadata;
 mod rkyvutil;
diff --git a/crates/puffin-client/src/registry_client.rs b/crates/puffin-client/src/registry_client.rs
index f440b4512..5fdc909b1 100644
--- a/crates/puffin-client/src/registry_client.rs
+++ b/crates/puffin-client/src/registry_client.rs
@@ -27,6 +27,7 @@ use pypi_types::{Metadata21, SimpleJson};
 use crate::cached_client::CacheControl;
 use crate::html::SimpleHtml;
 use crate::remote_metadata::wheel_metadata_from_remote_zip;
+use crate::rkyvutil::OwnedArchive;
 use crate::{CachedClient, CachedClientError, Error, ErrorKind};
 
 /// A builder for an [`RegistryClient`].
@@ -122,7 +123,7 @@ impl RegistryClient {
     pub async fn simple(
         &self,
         package_name: &PackageName,
-    ) -> Result<(IndexUrl, SimpleMetadata), Error> {
+    ) -> Result<(IndexUrl, OwnedArchive<SimpleMetadata>), Error> {
         if self.index_urls.no_index() {
             return Err(ErrorKind::NoIndex(package_name.as_ref().to_string()).into());
         }
@@ -152,7 +153,7 @@ impl RegistryClient {
         &self,
         package_name: &PackageName,
         index: &IndexUrl,
-    ) -> Result<Result<SimpleMetadata, CachedClientError<Error>>, Error> {
+    ) -> Result<Result<OwnedArchive<SimpleMetadata>, CachedClientError<Error>>, Error> {
         // Format the URL for PyPI.
         let mut url: Url = index.clone().into();
         url.path_segments_mut()
@@ -168,7 +169,7 @@ impl RegistryClient {
                 IndexUrl::Pypi => "pypi".to_string(),
                 IndexUrl::Url(url) => cache_key::digest(&cache_key::CanonicalUrl::new(url)),
             }),
-            format!("{package_name}.msgpack"),
+            format!("{package_name}.rkyv"),
         );
         let cache_control = CacheControl::from(
             self.cache
@@ -201,14 +202,14 @@ impl RegistryClient {
                     ))
                 })?;
 
-                match media_type {
+                let unarchived = match media_type {
                     MediaType::Json => {
                         let bytes = response.bytes().await.map_err(ErrorKind::RequestError)?;
                         let data: SimpleJson = serde_json::from_slice(bytes.as_ref())
                             .map_err(|err| Error::from_json_err(err, url.clone()))?;
                         let metadata =
                             SimpleMetadata::from_files(data.files, package_name, url.as_str());
-                        Ok(metadata)
+                        metadata
                     }
                     MediaType::Html => {
                         let text = response.text().await.map_err(ErrorKind::RequestError)?;
@@ -216,16 +217,17 @@ impl RegistryClient {
                             .map_err(|err| Error::from_html_err(err, url.clone()))?;
                         let metadata =
                             SimpleMetadata::from_files(files, package_name, base.as_url().as_str());
-                        Ok(metadata)
+                        metadata
                     }
-                }
+                };
+                OwnedArchive::from_unarchived(&unarchived)
             }
             .boxed()
             .instrument(info_span!("parse_simple_api", package = %package_name))
         };
         let result = self
             .client
-            .get_cached_with_callback(
+            .get_cacheable(
                 simple_request,
                 &cache_entry,
                 cache_control,
@@ -339,7 +341,7 @@ impl RegistryClient {
                 .map_err(ErrorKind::RequestError)?;
             Ok(self
                 .client
-                .get_cached_with_callback(req, &cache_entry, cache_control, response_callback)
+                .get_serde(req, &cache_entry, cache_control, response_callback)
                 .await?)
         } else {
             // If we lack PEP 658 support, try using HTTP range requests to read only the
@@ -399,7 +401,7 @@ impl RegistryClient {
                 .map_err(ErrorKind::RequestError)?;
             let result = self
                 .client
-                .get_cached_with_callback(
+                .get_serde(
                     req,
                     &cache_entry,
                     cache_control,
diff --git a/crates/puffin-dev/src/resolve_many.rs b/crates/puffin-dev/src/resolve_many.rs
index 1c37b2b5b..3f510a8fa 100644
--- a/crates/puffin-dev/src/resolve_many.rs
+++ b/crates/puffin-dev/src/resolve_many.rs
@@ -15,7 +15,7 @@ use pep440_rs::{Version, VersionSpecifier, VersionSpecifiers};
 use pep508_rs::{Requirement, VersionOrUrl};
 use platform_host::Platform;
 use puffin_cache::{Cache, CacheArgs};
-use puffin_client::{FlatIndex, RegistryClient, RegistryClientBuilder};
+use puffin_client::{FlatIndex, OwnedArchive, RegistryClient, RegistryClientBuilder};
 use puffin_dispatch::BuildDispatch;
 use puffin_installer::NoBinary;
 use puffin_interpreter::Virtualenv;
@@ -48,7 +48,8 @@ async fn find_latest_version(
     client: &RegistryClient,
     package_name: &PackageName,
 ) -> Option<Version> {
-    let (_, simple_metadata) = client.simple(package_name).await.ok()?;
+    let (_, raw_simple_metadata) = client.simple(package_name).await.ok()?;
+    let simple_metadata = OwnedArchive::deserialize(&raw_simple_metadata);
     let version = simple_metadata.into_iter().next()?.version;
     Some(version.clone())
 }
diff --git a/crates/puffin-distribution/src/distribution_database.rs b/crates/puffin-distribution/src/distribution_database.rs
index 883d4807c..cf781667b 100644
--- a/crates/puffin-distribution/src/distribution_database.rs
+++ b/crates/puffin-distribution/src/distribution_database.rs
@@ -177,7 +177,7 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
                 let archive = self
                     .client
                     .cached_client()
-                    .get_cached_with_callback(req, &http_entry, cache_control, download)
+                    .get_serde(req, &http_entry, cache_control, download)
                     .await
                     .map_err(|err| match err {
                         CachedClientError::Callback(err) => err,
@@ -240,7 +240,7 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
                 let archive = self
                     .client
                     .cached_client()
-                    .get_cached_with_callback(req, &http_entry, cache_control, download)
+                    .get_serde(req, &http_entry, cache_control, download)
                     .await
                     .map_err(|err| match err {
                         CachedClientError::Callback(err) => err,
diff --git a/crates/puffin-distribution/src/source/mod.rs b/crates/puffin-distribution/src/source/mod.rs
index d1ad59103..7e3464ae9 100644
--- a/crates/puffin-distribution/src/source/mod.rs
+++ b/crates/puffin-distribution/src/source/mod.rs
@@ -273,7 +273,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
             let req = self.cached_client.uncached().get(url.clone()).build()?;
             let manifest = self
                 .cached_client
-                .get_cached_with_callback(req, &cache_entry, cache_control, download)
+                .get_serde(req, &cache_entry, cache_control, download)
                 .await
                 .map_err(|err| match err {
                     CachedClientError::Callback(err) => err,
@@ -366,7 +366,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
             let req = self.cached_client.uncached().get(url.clone()).build()?;
             let manifest = self
                 .cached_client
-                .get_cached_with_callback(req, &cache_entry, cache_control, download)
+                .get_serde(req, &cache_entry, cache_control, download)
                 .await
                 .map_err(|err| match err {
                     CachedClientError::Callback(err) => err,
@@ -941,10 +941,11 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
 
 /// Read an existing HTTP-cached [`Manifest`], if it exists.
 pub(crate) fn read_http_manifest(cache_entry: &CacheEntry) -> Result<Option<Manifest>, Error> {
-    match std::fs::read(cache_entry.path()) {
-        Ok(cached) => Ok(Some(
-            rmp_serde::from_slice::<DataWithCachePolicy<Manifest>>(&cached)?.data,
-        )),
+    match std::fs::File::open(cache_entry.path()) {
+        Ok(file) => {
+            let data = DataWithCachePolicy::from_reader(file)?.data;
+            Ok(Some(rmp_serde::from_slice::<Manifest>(&data)?))
+        }
         Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(None),
         Err(err) => Err(Error::CacheRead(err)),
     }
diff --git a/crates/puffin-resolver/Cargo.toml b/crates/puffin-resolver/Cargo.toml
index a716a75ea..4f87cabfb 100644
--- a/crates/puffin-resolver/Cargo.toml
+++ b/crates/puffin-resolver/Cargo.toml
@@ -40,7 +40,6 @@ dashmap = { workspace = true }
 derivative = { workspace = true }
 fs-err = { workspace = true, features = ["tokio"] }
 futures = { workspace = true }
-http-cache-semantics = { workspace = true }
 indexmap = { workspace = true }
 itertools = { workspace = true }
 once_cell = { workspace = true }
diff --git a/crates/puffin-resolver/src/finder.rs b/crates/puffin-resolver/src/finder.rs
index 33412cdfb..93bb8ac77 100644
--- a/crates/puffin-resolver/src/finder.rs
+++ b/crates/puffin-resolver/src/finder.rs
@@ -12,7 +12,7 @@ use distribution_types::{Dist, IndexUrl, Resolution};
 use pep508_rs::{Requirement, VersionOrUrl};
 use platform_tags::Tags;
 use puffin_client::{
-    FlatDistributions, FlatIndex, RegistryClient, SimpleMetadata, SimpleMetadatum,
+    FlatDistributions, FlatIndex, OwnedArchive, RegistryClient, SimpleMetadata, SimpleMetadatum,
 };
 use puffin_interpreter::Interpreter;
 use puffin_normalize::PackageName;
@@ -67,7 +67,8 @@ impl<'a> DistFinder<'a> {
             match requirement.version_or_url.as_ref() {
                 None | Some(VersionOrUrl::VersionSpecifier(_)) => {
                     // Query the index(es) (cached) to get the URLs for the available files.
-                    let (index, metadata) = self.client.simple(&requirement.name).await?;
+                    let (index, raw_metadata) = self.client.simple(&requirement.name).await?;
+                    let metadata = OwnedArchive::deserialize(&raw_metadata);
 
                     // Pick a version that satisfies the requirement.
                     let Some(dist) = self.select(requirement, metadata, &index, flat_index) else {
diff --git a/crates/puffin-resolver/src/version_map.rs b/crates/puffin-resolver/src/version_map.rs
index 2838deffa..8d32f0048 100644
--- a/crates/puffin-resolver/src/version_map.rs
+++ b/crates/puffin-resolver/src/version_map.rs
@@ -8,7 +8,7 @@ use distribution_filename::DistFilename;
 use distribution_types::{Dist, IndexUrl, PrioritizedDistribution, ResolvableDist};
 use pep440_rs::Version;
 use platform_tags::Tags;
-use puffin_client::{FlatDistributions, SimpleMetadata, SimpleMetadatum};
+use puffin_client::{FlatDistributions, OwnedArchive, SimpleMetadata, SimpleMetadatum};
 use puffin_normalize::PackageName;
 use puffin_traits::NoBinary;
 use puffin_warnings::warn_user_once;
@@ -26,7 +26,7 @@ impl VersionMap {
     #[instrument(skip_all, fields(package_name))]
     #[allow(clippy::too_many_arguments)]
     pub(crate) fn from_metadata(
-        metadata: SimpleMetadata,
+        raw_metadata: OwnedArchive<SimpleMetadata>,
         package_name: &PackageName,
         index: &IndexUrl,
         tags: &Tags,
@@ -36,6 +36,15 @@ impl VersionMap {
         flat_index: Option<FlatDistributions>,
         no_binary: &NoBinary,
     ) -> Self {
+        // NOTE: We should experiment with refactoring the code
+        // below to work on rkyv::Archived<SimpleMetadata>. More
+        // specifically, we may want to adjust VersionMap itself to
+        // contain an Archived<SimpleMetadata> of some kind, that in
+        // turn is used in the resolver. The idea here is to avoid
+        // eagerly deserializing all of the metadata for a package
+        // up-front.
+        let metadata = OwnedArchive::deserialize(&raw_metadata);
+
         // If we have packages of the same name from find links, gives them priority, otherwise start empty
         let mut version_map: BTreeMap<Version, PrioritizedDistribution> =
             flat_index.map(Into::into).unwrap_or_default();