Add stable hash crate (#281)

This PR adds a `puffin-cache` crate that we can share across a variety of
other crates to generate stable hashes.
This commit is contained in:
Charlie Marsh 2023-11-01 16:41:45 -07:00 committed by GitHub
parent 67e3e45839
commit 8123e1a8f6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 602 additions and 23 deletions

17
Cargo.lock generated
View file

@ -1969,6 +1969,15 @@ dependencies = [
"zip",
]
[[package]]
name = "puffin-cache"
version = "0.0.1"
dependencies = [
"hex",
"seahash",
"url",
]
[[package]]
name = "puffin-cli"
version = "0.0.1"
@ -2091,9 +2100,9 @@ name = "puffin-distribution"
version = "0.1.0"
dependencies = [
"anyhow",
"distribution-filename",
"hex",
"pep440_rs 0.3.12",
"puffin-cache",
"puffin-package",
"sha2",
"url",
@ -2713,6 +2722,12 @@ dependencies = [
"untrusted",
]
[[package]]
name = "seahash"
version = "4.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"
[[package]]
name = "serde"
version = "1.0.190"

View file

@ -0,0 +1,16 @@
[package]
name = "puffin-cache"
version = "0.0.1"
description = "Generate stable hash digests across versions and platforms."
edition = { workspace = true }
rust-version = { workspace = true }
homepage = { workspace = true }
documentation = { workspace = true }
repository = { workspace = true }
authors = { workspace = true }
license = { workspace = true }
[dependencies]
hex = { workspace = true }
seahash = { workspace = true }
url = { workspace = true }

View file

@ -0,0 +1,356 @@
use std::borrow::Cow;
use std::collections::{BTreeMap, BTreeSet};
use std::hash::{Hash, Hasher};
use std::num::{
NonZeroI128, NonZeroI16, NonZeroI32, NonZeroI64, NonZeroI8, NonZeroU128, NonZeroU16,
NonZeroU32, NonZeroU64, NonZeroU8,
};
use seahash::SeaHasher;
/// Types that can feed themselves into a [`CacheKeyHasher`] so that the resulting hash is
/// intended to be stable across versions and platforms. Equivalent to Ruff's `CacheKey` trait.
pub trait CacheKey {
    /// Feeds this value into `state`.
    fn cache_key(&self, state: &mut CacheKeyHasher);

    /// Feeds every element of `data` into `state`, in slice order.
    ///
    /// No length prefix is written here; the slice and `Vec` implementations write the length
    /// themselves before calling this.
    fn cache_key_slice(data: &[Self], state: &mut CacheKeyHasher)
    where
        Self: Sized,
    {
        data.iter().for_each(|item| item.cache_key(state));
    }
}
impl CacheKey for bool {
    /// Writes a single byte: `1` for `true`, `0` for `false`.
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        let byte: u8 = if *self { 1 } else { 0 };
        state.write_u8(byte);
    }
}
impl CacheKey for char {
    /// Writes the character's Unicode scalar value as a `u32`.
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        let scalar = u32::from(*self);
        state.write_u32(scalar);
    }
}
impl CacheKey for usize {
    /// Forwards the value to [`Hasher::write_usize`].
    ///
    /// NOTE(review): `usize` has a target-dependent width, so the bytes fed to the hasher may
    /// differ between 32-bit and 64-bit targets — confirm this is acceptable for a hash that is
    /// meant to be stable across platforms.
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        Hasher::write_usize(state, *self);
    }
}
impl CacheKey for u128 {
    /// Forwards the value to [`Hasher::write_u128`].
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        Hasher::write_u128(state, *self);
    }
}
impl CacheKey for u64 {
    /// Forwards the value to [`Hasher::write_u64`].
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        Hasher::write_u64(state, *self);
    }
}
impl CacheKey for u32 {
    /// Forwards the value to [`Hasher::write_u32`].
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        Hasher::write_u32(state, *self);
    }
}
impl CacheKey for u16 {
    /// Forwards the value to [`Hasher::write_u16`].
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        Hasher::write_u16(state, *self);
    }
}
impl CacheKey for u8 {
    /// Forwards the value to [`Hasher::write_u8`].
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        Hasher::write_u8(state, *self);
    }
}
impl CacheKey for isize {
    /// Forwards the value to [`Hasher::write_isize`].
    ///
    /// NOTE(review): `isize` has a target-dependent width, so the bytes fed to the hasher may
    /// differ between 32-bit and 64-bit targets — confirm this is acceptable for a hash that is
    /// meant to be stable across platforms.
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        Hasher::write_isize(state, *self);
    }
}
impl CacheKey for i128 {
    /// Forwards the value to [`Hasher::write_i128`].
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        Hasher::write_i128(state, *self);
    }
}
impl CacheKey for i64 {
    /// Forwards the value to [`Hasher::write_i64`].
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        Hasher::write_i64(state, *self);
    }
}
impl CacheKey for i32 {
    /// Forwards the value to [`Hasher::write_i32`].
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        Hasher::write_i32(state, *self);
    }
}
impl CacheKey for i16 {
    /// Forwards the value to [`Hasher::write_i16`].
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        Hasher::write_i16(state, *self);
    }
}
impl CacheKey for i8 {
    /// Forwards the value to [`Hasher::write_i8`].
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        Hasher::write_i8(state, *self);
    }
}
macro_rules! impl_cache_key_non_zero {
($name:ident) => {
impl CacheKey for $name {
#[inline]
fn cache_key(&self, state: &mut CacheKeyHasher) {
self.get().cache_key(state)
}
}
};
}
impl_cache_key_non_zero!(NonZeroU8);
impl_cache_key_non_zero!(NonZeroU16);
impl_cache_key_non_zero!(NonZeroU32);
impl_cache_key_non_zero!(NonZeroU64);
impl_cache_key_non_zero!(NonZeroU128);
impl_cache_key_non_zero!(NonZeroI8);
impl_cache_key_non_zero!(NonZeroI16);
impl_cache_key_non_zero!(NonZeroI32);
impl_cache_key_non_zero!(NonZeroI64);
impl_cache_key_non_zero!(NonZeroI128);
/// Expands to the final identifier of a comma-terminated identifier list.
macro_rules! last_type {
    ($only:ident,) => { $only };
    ($head:ident, $($tail:ident,)+) => { last_type!($($tail,)+) };
}

/// Implements [`CacheKey`] for tuples.
///
/// With no arguments it covers the unit type, which writes nothing. With one or more type
/// parameter names it covers the tuple of that arity: each field is hashed in declaration
/// order, and no length or separator is written.
macro_rules! impl_cache_key_tuple {
    () => (
        impl CacheKey for () {
            #[inline]
            fn cache_key(&self, _state: &mut CacheKeyHasher) {}
        }
    );
    ( $($elem:ident)+ ) => (
        impl<$($elem: CacheKey),+> CacheKey for ($($elem,)+)
        where
            last_type!($($elem,)+): ?Sized,
        {
            // Each type parameter name doubles as the binding for its field, hence the allow.
            #[allow(non_snake_case)]
            #[inline]
            fn cache_key(&self, state: &mut CacheKeyHasher) {
                let ($($elem,)+) = self;
                $($elem.cache_key(state);)+
            }
        }
    );
}

impl_cache_key_tuple! {}
impl_cache_key_tuple! { T }
impl_cache_key_tuple! { T B }
impl_cache_key_tuple! { T B C }
impl_cache_key_tuple! { T B C D }
impl_cache_key_tuple! { T B C D E }
impl_cache_key_tuple! { T B C D E F }
impl_cache_key_tuple! { T B C D E F G }
impl_cache_key_tuple! { T B C D E F G H }
impl_cache_key_tuple! { T B C D E F G H I }
impl_cache_key_tuple! { T B C D E F G H I J }
impl_cache_key_tuple! { T B C D E F G H I J K }
impl_cache_key_tuple! { T B C D E F G H I J K L }
impl CacheKey for str {
    /// Hashes the string through the standard library's [`Hash`] implementation for `str`.
    ///
    /// NOTE(review): the bytes written are whatever std's `Hash for str` emits — confirm that
    /// is stable enough across Rust releases for the guarantees this crate advertises.
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        Hash::hash(self, state);
    }
}
impl CacheKey for String {
    /// Hashes the string through the standard library's [`Hash`] implementation for `String`.
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        Hash::hash(self, state);
    }
}
impl<T: CacheKey> CacheKey for Option<T> {
    /// Writes a `usize` tag (`0` for `None`, `1` for `Some`) and, for `Some`, the contained
    /// value after the tag.
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        let Some(value) = self else {
            state.write_usize(0);
            return;
        };
        state.write_usize(1);
        value.cache_key(state);
    }
}
impl<T: CacheKey> CacheKey for [T] {
    /// Writes the slice length as a `usize`, then every element in order.
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        let len = self.len();
        state.write_usize(len);
        T::cache_key_slice(self, state);
    }
}
impl<T: ?Sized + CacheKey> CacheKey for &T {
    /// Hashes the referenced value; the reference itself contributes nothing.
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        T::cache_key(&**self, state);
    }
}
impl<T: ?Sized + CacheKey> CacheKey for &mut T {
    /// Hashes the referenced value; the reference itself contributes nothing.
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        T::cache_key(&**self, state);
    }
}
impl<T> CacheKey for Vec<T>
where
    T: CacheKey,
{
    /// Writes the vector length as a `usize`, then every element in order.
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        let items: &[T] = self.as_slice();
        state.write_usize(items.len());
        T::cache_key_slice(items, state);
    }
}
impl<V: CacheKey> CacheKey for BTreeSet<V> {
    /// Writes the set size as a `usize`, then every element in the set's iteration order.
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        state.write_usize(self.len());
        self.iter().for_each(|element| element.cache_key(state));
    }
}
impl<K: CacheKey + Ord, V: CacheKey> CacheKey for BTreeMap<K, V> {
    /// Writes the map size as a `usize`, then each entry as key followed by value, in the
    /// map's iteration order.
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        state.write_usize(self.len());
        self.iter().for_each(|(entry_key, entry_value)| {
            entry_key.cache_key(state);
            entry_value.cache_key(state);
        });
    }
}
impl<V: ?Sized> CacheKey for Cow<'_, V>
where
    V: CacheKey + ToOwned,
{
    /// Hashes the borrowed form of the value, whether the `Cow` is borrowed or owned.
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        let borrowed: &V = self;
        borrowed.cache_key(state);
    }
}
/// Hasher handed to [`CacheKey`] implementations.
///
/// Wraps a [`SeaHasher`]; the `Hasher` implementation in this file forwards every call to it.
#[derive(Clone, Default)]
pub struct CacheKeyHasher {
    /// The wrapped SeaHash hasher that receives all written data.
    inner: SeaHasher,
}
impl CacheKeyHasher {
    /// Creates a hasher wrapping a fresh `SeaHasher::new()`.
    pub fn new() -> Self {
        let inner = SeaHasher::new();
        Self { inner }
    }
}
/// Forwards every [`Hasher`] method to the wrapped hasher's method of the same name, rather
/// than relying on the trait's default `write_*` implementations.
impl Hasher for CacheKeyHasher {
    // Raw bytes.
    #[inline]
    fn write(&mut self, data: &[u8]) {
        self.inner.write(data);
    }

    // Unsigned integers.
    #[inline]
    fn write_u8(&mut self, value: u8) {
        self.inner.write_u8(value);
    }
    #[inline]
    fn write_u16(&mut self, value: u16) {
        self.inner.write_u16(value);
    }
    #[inline]
    fn write_u32(&mut self, value: u32) {
        self.inner.write_u32(value);
    }
    #[inline]
    fn write_u64(&mut self, value: u64) {
        self.inner.write_u64(value);
    }
    #[inline]
    fn write_u128(&mut self, value: u128) {
        self.inner.write_u128(value);
    }
    #[inline]
    fn write_usize(&mut self, value: usize) {
        self.inner.write_usize(value);
    }

    // Signed integers.
    #[inline]
    fn write_i8(&mut self, value: i8) {
        self.inner.write_i8(value);
    }
    #[inline]
    fn write_i16(&mut self, value: i16) {
        self.inner.write_i16(value);
    }
    #[inline]
    fn write_i32(&mut self, value: i32) {
        self.inner.write_i32(value);
    }
    #[inline]
    fn write_i64(&mut self, value: i64) {
        self.inner.write_i64(value);
    }
    #[inline]
    fn write_i128(&mut self, value: i128) {
        self.inner.write_i128(value);
    }
    #[inline]
    fn write_isize(&mut self, value: isize) {
        self.inner.write_isize(value);
    }

    /// Returns the wrapped hasher's current hash value.
    #[inline]
    fn finish(&self) -> u64 {
        self.inner.finish()
    }
}

View file

@ -0,0 +1,61 @@
use url::Url;
use crate::cache_key::{CacheKey, CacheKeyHasher};
/// A wrapper around `Url` which represents a "canonical" version of an
/// original URL.
///
/// A "canonical" URL is only intended for internal comparison purposes. It's
/// to help paper over mistakes such as depending on `github.com/foo/bar` vs.
/// `github.com/foo/bar.git`. This is **only** for internal purposes and
/// provides no means to actually read the underlying string value of the `Url`
/// it contains. This is intentional, because all fetching should still happen
/// within the context of the original URL.
///
/// The wrapped `Url` is a private field; values are built with
/// [`CanonicalUrl::new`], which applies the normalization.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
pub struct CanonicalUrl(Url);
impl CanonicalUrl {
    /// Builds the canonical form of `url`.
    ///
    /// The input is cloned and the clone is normalized as follows:
    ///
    /// 1. A trailing slash on the path is removed.
    /// 2. For the host `github.com`, the scheme is replaced with `https` and the path is
    ///    lower-cased.
    /// 3. A `.git` extension (matched ASCII case-insensitively) on the last path segment is
    ///    removed.
    ///
    /// URLs that cannot be a base (for example `mailto:` or `data:` URLs) have no path
    /// segments to normalize and are returned unchanged instead of panicking.
    pub fn new(url: &Url) -> CanonicalUrl {
        let mut url = url.clone();

        // `path_segments` / `path_segments_mut` only succeed for URLs that can be a base, so
        // there is nothing to normalize otherwise. Bailing out here also makes the `expect`s
        // below unreachable.
        if url.cannot_be_a_base() {
            return CanonicalUrl(url);
        }

        // Strip a trailing slash.
        if url.path().ends_with('/') {
            url.path_segments_mut()
                .expect("URL was checked to be a base")
                .pop_if_empty();
        }

        // For GitHub URLs specifically, just lower-case everything. GitHub
        // treats both the same, but they hash differently, and we're gonna be
        // hashing them. This wants a more general solution, and also we're
        // almost certainly not using the same case conversion rules that GitHub
        // does. (See issue #84)
        if url.host_str() == Some("github.com") {
            url = format!("https{}", &url[url::Position::AfterScheme..])
                .parse()
                .expect("URL re-serialized with an `https` scheme should parse");
            let path = url.path().to_lowercase();
            url.set_path(&path);
        }

        // Repos can generally be accessed with or without `.git` extension.
        let needs_chopping = std::path::Path::new(url.path())
            .extension()
            .is_some_and(|ext| ext.eq_ignore_ascii_case("git"));
        if needs_chopping {
            // The last URL segment is not always the file name that `Path` inspected: for a
            // path such as `/foo.git//` it is empty even after one trailing slash was
            // stripped. Only chop when the segment itself really ends in `.git`, so the
            // length arithmetic cannot underflow.
            let stem = url
                .path_segments()
                .and_then(|mut segments| segments.next_back())
                .and_then(|last| {
                    let split = last.len().checked_sub(".git".len())?;
                    let (stem, extension) = (last.get(..split)?, last.get(split..)?);
                    extension
                        .eq_ignore_ascii_case(".git")
                        .then(|| stem.to_owned())
                });
            if let Some(stem) = stem {
                url.path_segments_mut()
                    .expect("URL was checked to be a base")
                    .pop()
                    .push(&stem);
            }
        }

        CanonicalUrl(url)
    }
}
impl CacheKey for CanonicalUrl {
    /// Hashes the serialized string form of the wrapped URL.
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        // URL serialization follows a spec, so hashing `as_str()` insulates the digest from
        // any change in how the `url` crate implements hashing for `Url` itself.
        let serialized: &str = self.0.as_str();
        <str as CacheKey>::cache_key(serialized, state);
    }
}

View file

@ -0,0 +1,22 @@
use std::hash::Hasher;
use crate::cache_key::{CacheKey, CacheKeyHasher};
/// Hashes a [`CacheKey`] object and renders the resulting 64-bit hash as a hex string.
///
/// The value returned by [`digest`] should be stable across releases and platforms.
pub fn digest<H: CacheKey>(hashable: &H) -> String {
    let hash = cache_key_u64(hashable);
    to_hex(hash)
}
/// Hex-encodes the eight bytes of `num`, taken in little-endian order.
fn to_hex(num: u64) -> String {
    let bytes = num.to_le_bytes();
    hex::encode(bytes)
}
/// Feeds a [`CacheKey`] object into a fresh [`CacheKeyHasher`] and returns the finished `u64`.
fn cache_key_u64<H: CacheKey>(hashable: &H) -> u64 {
    let mut state = CacheKeyHasher::new();
    H::cache_key(hashable, &mut state);
    state.finish()
}

View file

@ -0,0 +1,113 @@
use std::hash::Hasher;
use seahash::SeaHasher;
pub use canonical_url::CanonicalUrl;
pub use digest::digest;
mod cache_key;
mod canonical_url;
mod digest;
/// Types that can feed themselves into a [`StableHasher`] so that the resulting hash is
/// intended to be stable across versions and platforms.
///
/// NOTE(review): this has the same shape as `cache_key::CacheKey` declared in this crate —
/// confirm whether both traits are meant to exist.
pub trait StableHash {
    /// Feeds this value into `state`.
    fn stable_hash(&self, state: &mut StableHasher);

    /// Feeds every element of `data` into `state`, in slice order, without a length prefix.
    fn stable_hash_slice(data: &[Self], state: &mut StableHasher)
    where
        Self: Sized,
    {
        data.iter().for_each(|item| item.stable_hash(state));
    }
}
/// Hasher handed to [`StableHash`] implementations.
///
/// Wraps a [`SeaHasher`]; the `Hasher` implementation in this file forwards every call to it.
#[derive(Clone, Default)]
pub struct StableHasher {
    /// The wrapped SeaHash hasher that receives all written data.
    inner: SeaHasher,
}
impl StableHasher {
    /// Creates a hasher wrapping a fresh `SeaHasher::new()`.
    pub fn new() -> Self {
        let inner = SeaHasher::new();
        Self { inner }
    }

    /// Consumes the hasher and returns the wrapped hasher's hash value.
    pub fn finish(self) -> u64 {
        let Self { inner } = self;
        inner.finish()
    }
}
/// Forwards every [`Hasher`] method to the wrapped hasher's method of the same name, rather
/// than relying on the trait's default `write_*` implementations.
impl Hasher for StableHasher {
    // Raw bytes.
    #[inline]
    fn write(&mut self, data: &[u8]) {
        self.inner.write(data);
    }

    // Unsigned integers.
    #[inline]
    fn write_u8(&mut self, value: u8) {
        self.inner.write_u8(value);
    }
    #[inline]
    fn write_u16(&mut self, value: u16) {
        self.inner.write_u16(value);
    }
    #[inline]
    fn write_u32(&mut self, value: u32) {
        self.inner.write_u32(value);
    }
    #[inline]
    fn write_u64(&mut self, value: u64) {
        self.inner.write_u64(value);
    }
    #[inline]
    fn write_u128(&mut self, value: u128) {
        self.inner.write_u128(value);
    }
    #[inline]
    fn write_usize(&mut self, value: usize) {
        self.inner.write_usize(value);
    }

    // Signed integers.
    #[inline]
    fn write_i8(&mut self, value: i8) {
        self.inner.write_i8(value);
    }
    #[inline]
    fn write_i16(&mut self, value: i16) {
        self.inner.write_i16(value);
    }
    #[inline]
    fn write_i32(&mut self, value: i32) {
        self.inner.write_i32(value);
    }
    #[inline]
    fn write_i64(&mut self, value: i64) {
        self.inner.write_i64(value);
    }
    #[inline]
    fn write_i128(&mut self, value: i128) {
        self.inner.write_i128(value);
    }
    #[inline]
    fn write_isize(&mut self, value: isize) {
        self.inner.write_isize(value);
    }

    /// Returns the wrapped hasher's current hash value.
    #[inline]
    fn finish(&self) -> u64 {
        self.inner.finish()
    }
}

View file

@ -478,7 +478,7 @@ optional-dependencies.foo = [
insta::with_settings!({
filters => vec![
(r"\d+(ms|s)", "[TIME]"),
(r"(\d|\.)+(ms|s)", "[TIME]"),
(r"# .* pip-compile", "# [BIN_PATH] pip-compile"),
(r"--cache-dir .*", "--cache-dir [CACHE_DIR]"),
]
@ -739,7 +739,7 @@ optional-dependencies.bar = [
insta::with_settings!({
filters => vec![
(r"\d+(ms|s)", "[TIME]"),
(r"(\d|\.)+(ms|s)", "[TIME]"),
(r"# .* pip-compile", "# [BIN_PATH] pip-compile"),
(r"--cache-dir .*", "--cache-dir [CACHE_DIR]"),
]
@ -794,7 +794,7 @@ optional-dependencies.bar = [
insta::with_settings!({
filters => vec![
(r"\d+(ms|s)", "[TIME]"),
(r"(\d|\.)+(ms|s)", "[TIME]"),
(r"# .* pip-compile", "# [BIN_PATH] pip-compile"),
(r"--cache-dir .*", "--cache-dir [CACHE_DIR]"),
]

View file

@ -7,9 +7,9 @@ info:
- pyproject.toml
- "--all-extras"
- "--cache-dir"
- /var/folders/bc/qlsk3t6x7c9fhhbvvcg68k9c0000gp/T/.tmpw8DJ9R
- /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpFzJKRe
env:
VIRTUAL_ENV: /var/folders/bc/qlsk3t6x7c9fhhbvvcg68k9c0000gp/T/.tmppqOrk1/.venv
VIRTUAL_ENV: /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpdadcmu/.venv
---
success: true
exit_code: 0

View file

@ -6,16 +6,16 @@ info:
- pip-compile
- requirements.in
- "--cache-dir"
- /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpqLat7L
- /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpsB5jkv
env:
VIRTUAL_ENV: /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpgjHydf/.venv
VIRTUAL_ENV: /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpIlvbvA/.venv
---
success: true
exit_code: 0
----- stdout -----
# This file was autogenerated by Puffin v0.0.1 via the following command:
# [BIN_PATH] pip-compile requirements.in --cache-dir [CACHE_DIR]
blinker==1.6.3
blinker==1.7.0
# via flask
click==8.1.7
# via flask

View file

@ -6,16 +6,16 @@ info:
- pip-compile
- requirements.in
- "--cache-dir"
- /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmppB5CDv
- /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpEo3Gie
env:
VIRTUAL_ENV: /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpYYP3a4/.venv
VIRTUAL_ENV: /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpfMx1of/.venv
---
success: true
exit_code: 0
----- stdout -----
# This file was autogenerated by Puffin v0.0.1 via the following command:
# [BIN_PATH] pip-compile requirements.in --cache-dir [CACHE_DIR]
blinker==1.6.3
blinker==1.7.0
# via flask
click==8.1.7
# via flask

View file

@ -6,16 +6,16 @@ info:
- pip-compile
- requirements.in
- "--cache-dir"
- /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpax2HqL
- /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpKbcIw1
env:
VIRTUAL_ENV: /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpRwQW5s/.venv
VIRTUAL_ENV: /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpacloOK/.venv
---
success: true
exit_code: 0
----- stdout -----
# This file was autogenerated by Puffin v0.0.1 via the following command:
# [BIN_PATH] pip-compile requirements.in --cache-dir [CACHE_DIR]
blinker==1.6.3
blinker==1.7.0
# via flask
click==8.1.7
# via flask

View file

@ -10,8 +10,8 @@ authors = { workspace = true }
license = { workspace = true }
[dependencies]
distribution-filename = { path = "../distribution-filename" }
pep440_rs = { path = "../pep440-rs" }
puffin-cache = { path = "../puffin-cache" }
puffin-package = { path = "../puffin-package" }
anyhow = { workspace = true }

View file

@ -7,6 +7,7 @@ use sha2::{Digest, Sha256};
use url::Url;
use pep440_rs::Version;
use puffin_cache::CanonicalUrl;
use puffin_package::dist_info_name::DistInfoName;
use puffin_package::package_name::PackageName;
use puffin_package::pypi_types::File;
@ -119,12 +120,7 @@ impl RemoteDistribution {
Self::Registry(name, version, _) => {
format!("{}-{}", DistInfoName::from(name), version)
}
Self::Url(_name, url) => {
let mut hasher = Sha256::new();
hasher.update(url.as_str().as_bytes());
let result = hasher.finalize();
hex::encode(result)
}
Self::Url(_name, url) => puffin_cache::digest(&CanonicalUrl::new(url)),
}
}
}

View file

@ -79,7 +79,7 @@ impl<'a> Downloader<'a> {
}
}
#[derive(Debug, Clone)]
#[derive(Debug)]
pub struct InMemoryDistribution {
/// The remote file from which this wheel was downloaded.
pub(crate) remote: RemoteDistribution,