Use hasher to compute resolution hash (#5495)

## Summary

Addressing one TODO. This should be more efficient.
This commit is contained in:
Charlie Marsh 2024-07-26 19:24:09 -04:00 committed by GitHub
parent e8d7c0cb58
commit 561625ed8c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 78 additions and 69 deletions

View file

@ -1,22 +1,34 @@
use std::hash::Hasher;
use crate::cache_key::{CacheKey, CacheKeyHasher};
use seahash::SeaHasher;
use std::hash::{Hash, Hasher};
/// Compute a hex string hash of a `CacheKey` object.
///
/// The value returned by [`digest`] should be stable across releases and platforms.
pub fn digest<H: CacheKey>(hashable: &H) -> String {
/// The value returned by [`cache_digest`] should be stable across releases and platforms.
pub fn cache_digest<H: CacheKey>(hashable: &H) -> String {
/// Compute a u64 hash of a [`CacheKey`] object.
fn cache_key_u64<H: CacheKey>(hashable: &H) -> u64 {
let mut hasher = CacheKeyHasher::new();
hashable.cache_key(&mut hasher);
hasher.finish()
}
to_hex(cache_key_u64(hashable))
}
/// Compute a hex string hash of a hashable object.
pub fn hash_digest<H: Hash>(hashable: &H) -> String {
/// Compute a u64 hash of a hashable object.
fn hash_u64<H: Hash>(hashable: &H) -> u64 {
let mut hasher = SeaHasher::new();
hashable.hash(&mut hasher);
hasher.finish()
}
to_hex(hash_u64(hashable))
}
/// Convert a u64 to a hex string.
fn to_hex(num: u64) -> String {
hex::encode(num.to_le_bytes())
}
/// Compute a u64 hash of a [`CacheKey`] object.
fn cache_key_u64<H: CacheKey>(hashable: &H) -> u64 {
let mut hasher = CacheKeyHasher::new();
hashable.cache_key(&mut hasher);
hasher.finish()
}

View file

@ -1,6 +1,6 @@
pub use cache_key::{CacheKey, CacheKeyHasher};
pub use canonical_url::{CanonicalUrl, RepositoryUrl};
pub use digest::digest;
pub use digest::{cache_digest, hash_digest};
mod cache_key;
mod canonical_url;

View file

@ -6,6 +6,7 @@ use crate::{InstalledMetadata, InstalledVersion, Name};
/// A distribution which is either installable, is a wheel in our cache or is already installed.
#[derive(Debug, Clone)]
#[allow(clippy::large_enum_variant)]
pub enum LocalDist {
Cached(CachedDist),
Installed(InstalledDist),

View file

@ -19,7 +19,7 @@ pub enum FileConversionError {
/// Internal analog to [`pypi_types::File`].
#[derive(
Debug, Clone, Serialize, Deserialize, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize,
Debug, Clone, Hash, Serialize, Deserialize, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize,
)]
#[archive(check_bytes)]
#[archive_attr(derive(Debug))]
@ -67,7 +67,7 @@ impl File {
/// While a registry file is generally a remote URL, it can also be a file if it comes from a directory flat indexes.
#[derive(
Debug, Clone, Serialize, Deserialize, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize,
Debug, Clone, Hash, Serialize, Deserialize, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize,
)]
#[archive(check_bytes)]
#[archive_attr(derive(Debug))]
@ -147,13 +147,14 @@ impl Display for FileLocation {
#[derive(
Debug,
Clone,
PartialEq,
Eq,
Hash,
Serialize,
Deserialize,
rkyv::Archive,
rkyv::Deserialize,
rkyv::Serialize,
PartialEq,
Eq,
)]
#[archive(check_bytes)]
#[archive_attr(derive(Debug))]
@ -185,7 +186,7 @@ impl From<UrlString> for String {
}
}
impl fmt::Display for UrlString {
impl Display for UrlString {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
fmt::Display::fmt(&self.0, f)
}

View file

@ -16,7 +16,7 @@ use uv_normalize::PackageName;
use crate::{DistributionMetadata, InstalledMetadata, InstalledVersion, Name, VersionOrUrlRef};
/// A built distribution (wheel) that is installed in a virtual environment.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Hash)]
pub enum InstalledDist {
/// The distribution was derived from a registry, like `PyPI`.
Registry(InstalledRegistryDist),
@ -30,14 +30,14 @@ pub enum InstalledDist {
LegacyEditable(InstalledLegacyEditable),
}
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Hash)]
pub struct InstalledRegistryDist {
pub name: PackageName,
pub version: Version,
pub path: PathBuf,
}
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Hash)]
pub struct InstalledDirectUrlDist {
pub name: PackageName,
pub version: Version,
@ -47,21 +47,21 @@ pub struct InstalledDirectUrlDist {
pub path: PathBuf,
}
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Hash)]
pub struct InstalledEggInfoFile {
pub name: PackageName,
pub version: Version,
pub path: PathBuf,
}
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Hash)]
pub struct InstalledEggInfoDirectory {
pub name: PackageName,
pub version: Version,
pub path: PathBuf,
}
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Hash)]
pub struct InstalledLegacyEditable {
pub name: PackageName,
pub version: Version,

View file

@ -126,14 +126,14 @@ impl std::fmt::Display for InstalledVersion<'_> {
/// Either a built distribution, a wheel, or a source distribution that exists at some location.
///
/// The location can be an index, URL or path (wheel), or index, URL, path or Git repository (source distribution).
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Hash)]
pub enum Dist {
Built(BuiltDist),
Source(SourceDist),
}
/// A wheel, with its three possible origins (index, url, path)
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Hash)]
#[allow(clippy::large_enum_variant)]
pub enum BuiltDist {
Registry(RegistryBuiltDist),
@ -142,7 +142,7 @@ pub enum BuiltDist {
}
/// A source distribution, with its possible origins (index, url, path, git)
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Hash)]
#[allow(clippy::large_enum_variant)]
pub enum SourceDist {
Registry(RegistrySourceDist),
@ -153,7 +153,7 @@ pub enum SourceDist {
}
/// A built distribution (wheel) that exists in a registry, like `PyPI`.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Hash)]
pub struct RegistryBuiltWheel {
pub filename: WheelFilename,
pub file: Box<File>,
@ -161,7 +161,7 @@ pub struct RegistryBuiltWheel {
}
/// A built distribution (wheel) that exists in a registry, like `PyPI`.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Hash)]
pub struct RegistryBuiltDist {
/// All wheels associated with this distribution. It is guaranteed
/// that there is at least one wheel.
@ -191,7 +191,7 @@ pub struct RegistryBuiltDist {
}
/// A built distribution (wheel) that exists at an arbitrary URL.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Hash)]
pub struct DirectUrlBuiltDist {
/// We require that wheel urls end in the full wheel filename, e.g.
/// `https://example.org/packages/flask-3.0.0-py3-none-any.whl`
@ -203,7 +203,7 @@ pub struct DirectUrlBuiltDist {
}
/// A built distribution (wheel) that exists in a local directory.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Hash)]
pub struct PathBuiltDist {
pub filename: WheelFilename,
/// The path to the wheel.
@ -213,7 +213,7 @@ pub struct PathBuiltDist {
}
/// A source distribution that exists in a registry, like `PyPI`.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Hash)]
pub struct RegistrySourceDist {
pub name: PackageName,
pub version: Version,
@ -230,7 +230,7 @@ pub struct RegistrySourceDist {
}
/// A source distribution that exists at an arbitrary URL.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Hash)]
pub struct DirectUrlSourceDist {
/// Unlike [`DirectUrlBuiltDist`], we can't require a full filename with a version here, people
/// like using e.g. `foo @ https://github.com/org/repo/archive/master.zip`
@ -244,7 +244,7 @@ pub struct DirectUrlSourceDist {
}
/// A source distribution that exists in a Git repository.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Hash)]
pub struct GitSourceDist {
pub name: PackageName,
/// The URL without the revision and subdirectory fragment.
@ -256,7 +256,7 @@ pub struct GitSourceDist {
}
/// A source distribution that exists in a local archive (e.g., a `.tar.gz` file).
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Hash)]
pub struct PathSourceDist {
pub name: PackageName,
/// The resolved, absolute path to the distribution which we use for installing.
@ -270,7 +270,7 @@ pub struct PathSourceDist {
}
/// A source distribution that exists in a local directory.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Hash)]
pub struct DirectorySourceDist {
pub name: PackageName,
/// The resolved, absolute path to the distribution which we use for installing.

View file

@ -71,7 +71,7 @@ impl Resolution {
}
}
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Hash)]
pub enum ResolutionDiagnostic {
MissingExtra {
/// The distribution that was requested with a non-existent extra. For example,

View file

@ -12,7 +12,8 @@ use crate::{
/// A distribution that can be used for resolution and installation.
///
/// Either an already-installed distribution or a distribution that can be installed.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Hash)]
#[allow(clippy::large_enum_variant)]
pub enum ResolvedDist {
Installed(InstalledDist),
Installable(Dist),

View file

@ -1,4 +1,4 @@
use std::collections::HashMap;
use std::collections::BTreeMap;
use std::path::PathBuf;
use serde::{Deserialize, Serialize};
@ -7,7 +7,7 @@ use url::Url;
/// Metadata for a distribution that was installed via a direct URL.
///
/// See: <https://packaging.python.org/en/latest/specifications/direct-url-data-structure/>
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case", untagged)]
pub enum DirectUrl {
/// The direct URL is a local directory. For example:
@ -41,23 +41,23 @@ pub enum DirectUrl {
},
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct DirInfo {
#[serde(skip_serializing_if = "Option::is_none")]
pub editable: Option<bool>,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct ArchiveInfo {
#[serde(skip_serializing_if = "Option::is_none")]
pub hash: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub hashes: Option<HashMap<String, String>>,
pub hashes: Option<BTreeMap<String, String>>,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct VcsInfo {
pub vcs: VcsKind,
@ -67,7 +67,7 @@ pub struct VcsInfo {
pub requested_revision: Option<String>,
}
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum VcsKind {
Git,

View file

@ -92,6 +92,7 @@ impl CoreMetadata {
Clone,
PartialEq,
Eq,
Hash,
Serialize,
Deserialize,
rkyv::Archive,

View file

@ -2,7 +2,7 @@ use std::path::{Path, PathBuf};
use url::Url;
use cache_key::{digest, CanonicalUrl};
use cache_key::{cache_digest, CanonicalUrl};
use distribution_types::IndexUrl;
/// Cache wheels and their metadata, both from remote wheels and built from source distributions.
@ -30,19 +30,19 @@ impl<'a> WheelCache<'a> {
WheelCache::Index(IndexUrl::Pypi(_)) => WheelCacheKind::Pypi.root(),
WheelCache::Index(url) => WheelCacheKind::Index
.root()
.join(digest(&CanonicalUrl::new(url))),
.join(cache_digest(&CanonicalUrl::new(url))),
WheelCache::Url(url) => WheelCacheKind::Url
.root()
.join(digest(&CanonicalUrl::new(url))),
.join(cache_digest(&CanonicalUrl::new(url))),
WheelCache::Path(url) => WheelCacheKind::Path
.root()
.join(digest(&CanonicalUrl::new(url))),
.join(cache_digest(&CanonicalUrl::new(url))),
WheelCache::Editable(url) => WheelCacheKind::Editable
.root()
.join(digest(&CanonicalUrl::new(url))),
.join(cache_digest(&CanonicalUrl::new(url))),
WheelCache::Git(url, sha) => WheelCacheKind::Git
.root()
.join(digest(&CanonicalUrl::new(url)))
.join(cache_digest(&CanonicalUrl::new(url)))
.join(sha),
}
}

View file

@ -141,7 +141,7 @@ impl<'a> FlatIndexClient<'a> {
let cache_entry = self.cache.entry(
CacheBucket::FlatIndex,
"html",
format!("{}.msgpack", cache_key::digest(&url.to_string())),
format!("{}.msgpack", cache_key::cache_digest(&url.to_string())),
);
let cache_control = match self.client.connectivity() {
Connectivity::Online => CacheControl::from(

View file

@ -63,7 +63,7 @@ impl GitResolver {
fs::create_dir_all(&lock_dir).await?;
let repository_url = RepositoryUrl::new(url.repository());
let _lock = LockedFile::acquire(
lock_dir.join(cache_key::digest(&repository_url)),
lock_dir.join(cache_key::cache_digest(&repository_url)),
&repository_url,
)?;

View file

@ -8,7 +8,7 @@ use reqwest_middleware::ClientWithMiddleware;
use tracing::{debug, instrument};
use url::Url;
use cache_key::{digest, RepositoryUrl};
use cache_key::{cache_digest, RepositoryUrl};
use crate::git::GitRemote;
use crate::{GitOid, GitSha, GitUrl};
@ -53,7 +53,7 @@ impl GitSource {
#[instrument(skip(self), fields(repository = %self.git.repository, rev = ?self.git.precise))]
pub fn fetch(self) -> Result<Fetch> {
// The path to the repo, within the Git database.
let ident = digest(&RepositoryUrl::new(&self.git.repository));
let ident = cache_digest(&RepositoryUrl::new(&self.git.repository));
let db_path = self.cache.join("db").join(&ident);
let remote = GitRemote::new(&self.git.repository);

View file

@ -17,9 +17,9 @@ use crate::{validate_and_normalize_owned, validate_and_normalize_ref, InvalidNam
Clone,
PartialEq,
Eq,
Hash,
PartialOrd,
Ord,
Hash,
Serialize,
rkyv::Archive,
rkyv::Deserialize,

View file

@ -197,7 +197,7 @@ impl PythonEnvironment {
} else {
// Otherwise, use a global lockfile.
LockedFile::acquire(
env::temp_dir().join(format!("uv-{}.lock", cache_key::digest(&self.0.root))),
env::temp_dir().join(format!("uv-{}.lock", cache_key::cache_digest(&self.0.root))),
self.0.root.user_display(),
)
}

View file

@ -11,7 +11,7 @@ use serde::{Deserialize, Serialize};
use thiserror::Error;
use tracing::{trace, warn};
use cache_key::digest;
use cache_key::cache_digest;
use install_wheel_rs::Layout;
use pep440_rs::Version;
use pep508_rs::{MarkerEnvironment, StringVersion};
@ -716,7 +716,7 @@ impl InterpreterInfo {
"",
// We use the absolute path for the cache entry to avoid cache collisions for relative
// paths. But we don't to query the executable with symbolic links resolved.
format!("{}.msgpack", digest(&absolute)),
format!("{}.msgpack", cache_digest(&absolute)),
);
// We check the timestamp of the canonicalized executable to check if an underlying

View file

@ -1,7 +1,6 @@
use itertools::Itertools;
use tracing::debug;
use cache_key::digest;
use cache_key::{cache_digest, hash_digest};
use distribution_types::Resolution;
use uv_cache::{Cache, CacheBucket};
use uv_client::Connectivity;
@ -75,18 +74,12 @@ impl CachedEnvironment {
// Hash the resolution by hashing the generated lockfile.
// TODO(charlie): If the resolution contains any mutable metadata (like a path or URL
// dependency), skip this step.
// TODO(charlie): Consider implementing `CacheKey` for `Resolution`.
let resolution_hash = digest(
&resolution
.distributions()
.map(std::string::ToString::to_string)
.join("\n")
.as_bytes(),
);
let distributions = resolution.distributions().collect::<Vec<_>>();
let resolution_hash = hash_digest(&distributions);
// Hash the interpreter based on its path.
// TODO(charlie): Come up with a robust hash for the interpreter.
let interpreter_hash = digest(&interpreter.sys_executable());
let interpreter_hash = cache_digest(&interpreter.sys_executable());
// Search in the content-addressed cache.
let cache_entry = cache.entry(CacheBucket::Environments, interpreter_hash, resolution_hash);