From 561625ed8cb033b35a6110f91b68bc3e9c918785 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Fri, 26 Jul 2024 19:24:09 -0400 Subject: [PATCH] Use hasher to compute resolution hash (#5495) ## Summary Addressing one TODO. This should be more efficient. --- crates/cache-key/src/digest.rs | 34 +++++++++++++------ crates/cache-key/src/lib.rs | 2 +- crates/distribution-types/src/any.rs | 1 + crates/distribution-types/src/file.rs | 11 +++--- crates/distribution-types/src/installed.rs | 12 +++---- crates/distribution-types/src/lib.rs | 24 ++++++------- crates/distribution-types/src/resolution.rs | 2 +- crates/distribution-types/src/resolved.rs | 3 +- crates/pypi-types/src/direct_url.rs | 14 ++++---- crates/pypi-types/src/simple_json.rs | 1 + crates/uv-cache/src/wheel.rs | 12 +++---- crates/uv-client/src/flat_index.rs | 2 +- crates/uv-git/src/resolver.rs | 2 +- crates/uv-git/src/source.rs | 4 +-- crates/uv-normalize/src/package_name.rs | 2 +- crates/uv-python/src/environment.rs | 2 +- crates/uv-python/src/interpreter.rs | 4 +-- crates/uv/src/commands/project/environment.rs | 15 +++----- 18 files changed, 78 insertions(+), 69 deletions(-) diff --git a/crates/cache-key/src/digest.rs b/crates/cache-key/src/digest.rs index 54f7cb204..8a74ce018 100644 --- a/crates/cache-key/src/digest.rs +++ b/crates/cache-key/src/digest.rs @@ -1,22 +1,34 @@ -use std::hash::Hasher; - use crate::cache_key::{CacheKey, CacheKeyHasher}; +use seahash::SeaHasher; +use std::hash::{Hash, Hasher}; /// Compute a hex string hash of a `CacheKey` object. /// -/// The value returned by [`digest`] should be stable across releases and platforms. -pub fn digest(hashable: &H) -> String { +/// The value returned by [`cache_digest`] should be stable across releases and platforms. +pub fn cache_digest(hashable: &H) -> String { + /// Compute a u64 hash of a [`CacheKey`] object. + fn cache_key_u64(hashable: &H) -> u64 { + let mut hasher = CacheKeyHasher::new(); + hashable.cache_key(&mut hasher); + hasher.finish() + } + to_hex(cache_key_u64(hashable)) } +/// Compute a hex string hash of a hashable object. +pub fn hash_digest(hashable: &H) -> String { + /// Compute a u64 hash of a hashable object. + fn hash_u64(hashable: &H) -> u64 { + let mut hasher = SeaHasher::new(); + hashable.hash(&mut hasher); + hasher.finish() + } + + to_hex(hash_u64(hashable)) +} + /// Convert a u64 to a hex string. fn to_hex(num: u64) -> String { hex::encode(num.to_le_bytes()) } - -/// Compute a u64 hash of a [`CacheKey`] object. -fn cache_key_u64(hashable: &H) -> u64 { - let mut hasher = CacheKeyHasher::new(); - hashable.cache_key(&mut hasher); - hasher.finish() -} diff --git a/crates/cache-key/src/lib.rs b/crates/cache-key/src/lib.rs index d7aa69929..8ada05579 100644 --- a/crates/cache-key/src/lib.rs +++ b/crates/cache-key/src/lib.rs @@ -1,6 +1,6 @@ pub use cache_key::{CacheKey, CacheKeyHasher}; pub use canonical_url::{CanonicalUrl, RepositoryUrl}; -pub use digest::digest; +pub use digest::{cache_digest, hash_digest}; mod cache_key; mod canonical_url; diff --git a/crates/distribution-types/src/any.rs b/crates/distribution-types/src/any.rs index c5657ca43..b0b8f10a9 100644 --- a/crates/distribution-types/src/any.rs +++ b/crates/distribution-types/src/any.rs @@ -6,6 +6,7 @@ use crate::{InstalledMetadata, InstalledVersion, Name}; /// A distribution which is either installable, is a wheel in our cache or is already installed. #[derive(Debug, Clone)] +#[allow(clippy::large_enum_variant)] pub enum LocalDist { Cached(CachedDist), Installed(InstalledDist), diff --git a/crates/distribution-types/src/file.rs b/crates/distribution-types/src/file.rs index 2674ef114..84f434cac 100644 --- a/crates/distribution-types/src/file.rs +++ b/crates/distribution-types/src/file.rs @@ -19,7 +19,7 @@ pub enum FileConversionError { /// Internal analog to [`pypi_types::File`]. #[derive( - Debug, Clone, Serialize, Deserialize, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize, + Debug, Clone, Hash, Serialize, Deserialize, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize, )] #[archive(check_bytes)] #[archive_attr(derive(Debug))] @@ -67,7 +67,7 @@ impl File { /// While a registry file is generally a remote URL, it can also be a file if it comes from a directory flat indexes. #[derive( - Debug, Clone, Serialize, Deserialize, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize, + Debug, Clone, Hash, Serialize, Deserialize, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize, )] #[archive(check_bytes)] #[archive_attr(derive(Debug))] @@ -147,13 +147,14 @@ impl Display for FileLocation { #[derive( Debug, Clone, + PartialEq, + Eq, + Hash, Serialize, Deserialize, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize, - PartialEq, - Eq, )] #[archive(check_bytes)] #[archive_attr(derive(Debug))] @@ -185,7 +186,7 @@ impl From for String { } } -impl fmt::Display for UrlString { +impl Display for UrlString { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { fmt::Display::fmt(&self.0, f) } diff --git a/crates/distribution-types/src/installed.rs b/crates/distribution-types/src/installed.rs index 4186b8f3c..e20736953 100644 --- a/crates/distribution-types/src/installed.rs +++ b/crates/distribution-types/src/installed.rs @@ -16,7 +16,7 @@ use uv_normalize::PackageName; use crate::{DistributionMetadata, InstalledMetadata, InstalledVersion, Name, VersionOrUrlRef}; /// A built distribution (wheel) that is installed in a virtual environment. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub enum InstalledDist { /// The distribution was derived from a registry, like `PyPI`. Registry(InstalledRegistryDist), @@ -30,14 +30,14 @@ pub enum InstalledDist { LegacyEditable(InstalledLegacyEditable), } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub struct InstalledRegistryDist { pub name: PackageName, pub version: Version, pub path: PathBuf, } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub struct InstalledDirectUrlDist { pub name: PackageName, pub version: Version, @@ -47,21 +47,21 @@ pub struct InstalledDirectUrlDist { pub path: PathBuf, } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub struct InstalledEggInfoFile { pub name: PackageName, pub version: Version, pub path: PathBuf, } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub struct InstalledEggInfoDirectory { pub name: PackageName, pub version: Version, pub path: PathBuf, } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub struct InstalledLegacyEditable { pub name: PackageName, pub version: Version, diff --git a/crates/distribution-types/src/lib.rs b/crates/distribution-types/src/lib.rs index dbcdff606..edd5a3313 100644 --- a/crates/distribution-types/src/lib.rs +++ b/crates/distribution-types/src/lib.rs @@ -126,14 +126,14 @@ impl std::fmt::Display for InstalledVersion<'_> { /// Either a built distribution, a wheel, or a source distribution that exists at some location. /// /// The location can be an index, URL or path (wheel), or index, URL, path or Git repository (source distribution). -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub enum Dist { Built(BuiltDist), Source(SourceDist), } /// A wheel, with its three possible origins (index, url, path) -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] #[allow(clippy::large_enum_variant)] pub enum BuiltDist { Registry(RegistryBuiltDist), @@ -142,7 +142,7 @@ pub enum BuiltDist { } /// A source distribution, with its possible origins (index, url, path, git) -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] #[allow(clippy::large_enum_variant)] pub enum SourceDist { Registry(RegistrySourceDist), @@ -153,7 +153,7 @@ pub enum SourceDist { } /// A built distribution (wheel) that exists in a registry, like `PyPI`. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub struct RegistryBuiltWheel { pub filename: WheelFilename, pub file: Box, @@ -161,7 +161,7 @@ pub struct RegistryBuiltWheel { } /// A built distribution (wheel) that exists in a registry, like `PyPI`. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub struct RegistryBuiltDist { /// All wheels associated with this distribution. It is guaranteed /// that there is at least one wheel. @@ -191,7 +191,7 @@ pub struct RegistryBuiltDist { } /// A built distribution (wheel) that exists at an arbitrary URL. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub struct DirectUrlBuiltDist { /// We require that wheel urls end in the full wheel filename, e.g. /// `https://example.org/packages/flask-3.0.0-py3-none-any.whl` @@ -203,7 +203,7 @@ pub struct DirectUrlBuiltDist { } /// A built distribution (wheel) that exists in a local directory. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub struct PathBuiltDist { pub filename: WheelFilename, /// The path to the wheel. @@ -213,7 +213,7 @@ pub struct PathBuiltDist { } /// A source distribution that exists in a registry, like `PyPI`. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub struct RegistrySourceDist { pub name: PackageName, pub version: Version, @@ -230,7 +230,7 @@ pub struct RegistrySourceDist { } /// A source distribution that exists at an arbitrary URL. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub struct DirectUrlSourceDist { /// Unlike [`DirectUrlBuiltDist`], we can't require a full filename with a version here, people /// like using e.g. `foo @ https://github.com/org/repo/archive/master.zip` @@ -244,7 +244,7 @@ pub struct DirectUrlSourceDist { } /// A source distribution that exists in a Git repository. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub struct GitSourceDist { pub name: PackageName, /// The URL without the revision and subdirectory fragment. @@ -256,7 +256,7 @@ pub struct GitSourceDist { } /// A source distribution that exists in a local archive (e.g., a `.tar.gz` file). -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub struct PathSourceDist { pub name: PackageName, /// The resolved, absolute path to the distribution which we use for installing. @@ -270,7 +270,7 @@ pub struct PathSourceDist { } /// A source distribution that exists in a local directory. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub struct DirectorySourceDist { pub name: PackageName, /// The resolved, absolute path to the distribution which we use for installing. diff --git a/crates/distribution-types/src/resolution.rs b/crates/distribution-types/src/resolution.rs index b4bec48f4..b2191b147 100644 --- a/crates/distribution-types/src/resolution.rs +++ b/crates/distribution-types/src/resolution.rs @@ -71,7 +71,7 @@ impl Resolution { } } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub enum ResolutionDiagnostic { MissingExtra { /// The distribution that was requested with a non-existent extra. For example, diff --git a/crates/distribution-types/src/resolved.rs b/crates/distribution-types/src/resolved.rs index 5a95d474e..c260d3250 100644 --- a/crates/distribution-types/src/resolved.rs +++ b/crates/distribution-types/src/resolved.rs @@ -12,7 +12,8 @@ use crate::{ /// A distribution that can be used for resolution and installation. /// /// Either an already-installed distribution or a distribution that can be installed. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] +#[allow(clippy::large_enum_variant)] pub enum ResolvedDist { Installed(InstalledDist), Installable(Dist), diff --git a/crates/pypi-types/src/direct_url.rs b/crates/pypi-types/src/direct_url.rs index b31ea4617..da2fbc65f 100644 --- a/crates/pypi-types/src/direct_url.rs +++ b/crates/pypi-types/src/direct_url.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::BTreeMap; use std::path::PathBuf; use serde::{Deserialize, Serialize}; @@ -7,7 +7,7 @@ use url::Url; /// Metadata for a distribution that was installed via a direct URL. /// /// See: -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] #[serde(rename_all = "snake_case", untagged)] pub enum DirectUrl { /// The direct URL is a local directory. For example: @@ -41,23 +41,23 @@ pub enum DirectUrl { }, } -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub struct DirInfo { #[serde(skip_serializing_if = "Option::is_none")] pub editable: Option, } -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub struct ArchiveInfo { #[serde(skip_serializing_if = "Option::is_none")] pub hash: Option, #[serde(skip_serializing_if = "Option::is_none")] - pub hashes: Option>, + pub hashes: Option>, } -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub struct VcsInfo { pub vcs: VcsKind, @@ -67,7 +67,7 @@ pub struct VcsInfo { pub requested_revision: Option, } -#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub enum VcsKind { Git, diff --git a/crates/pypi-types/src/simple_json.rs b/crates/pypi-types/src/simple_json.rs index 61690e90f..aed242159 100644 --- a/crates/pypi-types/src/simple_json.rs +++ b/crates/pypi-types/src/simple_json.rs @@ -92,6 +92,7 @@ impl CoreMetadata { Clone, PartialEq, Eq, + Hash, Serialize, Deserialize, rkyv::Archive, diff --git a/crates/uv-cache/src/wheel.rs b/crates/uv-cache/src/wheel.rs index 38619b837..02d45e89a 100644 --- a/crates/uv-cache/src/wheel.rs +++ b/crates/uv-cache/src/wheel.rs @@ -2,7 +2,7 @@ use std::path::{Path, PathBuf}; use url::Url; -use cache_key::{digest, CanonicalUrl}; +use cache_key::{cache_digest, CanonicalUrl}; use distribution_types::IndexUrl; /// Cache wheels and their metadata, both from remote wheels and built from source distributions. @@ -30,19 +30,19 @@ impl<'a> WheelCache<'a> { WheelCache::Index(IndexUrl::Pypi(_)) => WheelCacheKind::Pypi.root(), WheelCache::Index(url) => WheelCacheKind::Index .root() - .join(digest(&CanonicalUrl::new(url))), + .join(cache_digest(&CanonicalUrl::new(url))), WheelCache::Url(url) => WheelCacheKind::Url .root() - .join(digest(&CanonicalUrl::new(url))), + .join(cache_digest(&CanonicalUrl::new(url))), WheelCache::Path(url) => WheelCacheKind::Path .root() - .join(digest(&CanonicalUrl::new(url))), + .join(cache_digest(&CanonicalUrl::new(url))), WheelCache::Editable(url) => WheelCacheKind::Editable .root() - .join(digest(&CanonicalUrl::new(url))), + .join(cache_digest(&CanonicalUrl::new(url))), WheelCache::Git(url, sha) => WheelCacheKind::Git .root() - .join(digest(&CanonicalUrl::new(url))) + .join(cache_digest(&CanonicalUrl::new(url))) .join(sha), } } diff --git a/crates/uv-client/src/flat_index.rs b/crates/uv-client/src/flat_index.rs index 5f7be7aad..1997b0952 100644 --- a/crates/uv-client/src/flat_index.rs +++ b/crates/uv-client/src/flat_index.rs @@ -141,7 +141,7 @@ impl<'a> FlatIndexClient<'a> { let cache_entry = self.cache.entry( CacheBucket::FlatIndex, "html", - format!("{}.msgpack", cache_key::digest(&url.to_string())), + format!("{}.msgpack", cache_key::cache_digest(&url.to_string())), ); let cache_control = match self.client.connectivity() { Connectivity::Online => CacheControl::from( diff --git a/crates/uv-git/src/resolver.rs b/crates/uv-git/src/resolver.rs index 75cc8150a..82357c67d 100644 --- a/crates/uv-git/src/resolver.rs +++ b/crates/uv-git/src/resolver.rs @@ -63,7 +63,7 @@ impl GitResolver { fs::create_dir_all(&lock_dir).await?; let repository_url = RepositoryUrl::new(url.repository()); let _lock = LockedFile::acquire( - lock_dir.join(cache_key::digest(&repository_url)), + lock_dir.join(cache_key::cache_digest(&repository_url)), &repository_url, )?; diff --git a/crates/uv-git/src/source.rs b/crates/uv-git/src/source.rs index e33540313..dec18f25a 100644 --- a/crates/uv-git/src/source.rs +++ b/crates/uv-git/src/source.rs @@ -8,7 +8,7 @@ use reqwest_middleware::ClientWithMiddleware; use tracing::{debug, instrument}; use url::Url; -use cache_key::{digest, RepositoryUrl}; +use cache_key::{cache_digest, RepositoryUrl}; use crate::git::GitRemote; use crate::{GitOid, GitSha, GitUrl}; @@ -53,7 +53,7 @@ impl GitSource { #[instrument(skip(self), fields(repository = %self.git.repository, rev = ?self.git.precise))] pub fn fetch(self) -> Result { // The path to the repo, within the Git database. - let ident = digest(&RepositoryUrl::new(&self.git.repository)); + let ident = cache_digest(&RepositoryUrl::new(&self.git.repository)); let db_path = self.cache.join("db").join(&ident); let remote = GitRemote::new(&self.git.repository); diff --git a/crates/uv-normalize/src/package_name.rs b/crates/uv-normalize/src/package_name.rs index baaa0dc6d..31048474c 100644 --- a/crates/uv-normalize/src/package_name.rs +++ b/crates/uv-normalize/src/package_name.rs @@ -17,9 +17,9 @@ use crate::{validate_and_normalize_owned, validate_and_normalize_ref, InvalidNam Clone, PartialEq, Eq, - Hash, PartialOrd, Ord, + Hash, Serialize, rkyv::Archive, rkyv::Deserialize, diff --git a/crates/uv-python/src/environment.rs b/crates/uv-python/src/environment.rs index 08618535b..a7fa009e8 100644 --- a/crates/uv-python/src/environment.rs +++ b/crates/uv-python/src/environment.rs @@ -197,7 +197,7 @@ impl PythonEnvironment { } else { // Otherwise, use a global lockfile. LockedFile::acquire( - env::temp_dir().join(format!("uv-{}.lock", cache_key::digest(&self.0.root))), + env::temp_dir().join(format!("uv-{}.lock", cache_key::cache_digest(&self.0.root))), self.0.root.user_display(), ) } diff --git a/crates/uv-python/src/interpreter.rs b/crates/uv-python/src/interpreter.rs index 693e7d721..3172e526b 100644 --- a/crates/uv-python/src/interpreter.rs +++ b/crates/uv-python/src/interpreter.rs @@ -11,7 +11,7 @@ use serde::{Deserialize, Serialize}; use thiserror::Error; use tracing::{trace, warn}; -use cache_key::digest; +use cache_key::cache_digest; use install_wheel_rs::Layout; use pep440_rs::Version; use pep508_rs::{MarkerEnvironment, StringVersion}; @@ -716,7 +716,7 @@ impl InterpreterInfo { "", // We use the absolute path for the cache entry to avoid cache collisions for relative // paths. But we don't to query the executable with symbolic links resolved. - format!("{}.msgpack", digest(&absolute)), + format!("{}.msgpack", cache_digest(&absolute)), ); // We check the timestamp of the canonicalized executable to check if an underlying diff --git a/crates/uv/src/commands/project/environment.rs b/crates/uv/src/commands/project/environment.rs index bfc7b0ef1..2d4f8aaab 100644 --- a/crates/uv/src/commands/project/environment.rs +++ b/crates/uv/src/commands/project/environment.rs @@ -1,7 +1,6 @@ -use itertools::Itertools; use tracing::debug; -use cache_key::digest; +use cache_key::{cache_digest, hash_digest}; use distribution_types::Resolution; use uv_cache::{Cache, CacheBucket}; use uv_client::Connectivity; @@ -75,18 +74,12 @@ impl CachedEnvironment { // Hash the resolution by hashing the generated lockfile. // TODO(charlie): If the resolution contains any mutable metadata (like a path or URL // dependency), skip this step. - // TODO(charlie): Consider implementing `CacheKey` for `Resolution`. - let resolution_hash = digest( - &resolution - .distributions() - .map(std::string::ToString::to_string) - .join("\n") - .as_bytes(), - ); + let distributions = resolution.distributions().collect::>(); + let resolution_hash = hash_digest(&distributions); // Hash the interpreter based on its path. // TODO(charlie): Come up with a robust hash for the interpreter. - let interpreter_hash = digest(&interpreter.sys_executable()); + let interpreter_hash = cache_digest(&interpreter.sys_executable()); // Search in the content-addressed cache. let cache_entry = cache.entry(CacheBucket::Environments, interpreter_hash, resolution_hash);