refactor: Introduce CacheKey trait (#3323)

This PR introduces a new `CacheKey` trait for types that can be used as a cache key.

I'm not entirely sure if this is worth the "overhead", but I was surprised to find `HashableHashSet` and got scared when I looked at the time complexity of the `hash` function. These implementations must be extremely slow in hashed collections.

I then searched for usages and quickly realized that only the cache uses these `Hash` implementations, where performance is less sensitive.

This PR introduces a new `CacheKey` trait to communicate the difference between computing a hash and computing a key for the cache. The new trait can be implemented for types that don't implement `Hash` for performance reasons, and we can define additional constraints on the implementation. For example, we'll want to enforce portability when we add remote caching support. Using a different trait further allows us to not implement it for types without stable identities (e.g. pointers), or to use implementations other than the standard hash function.
This commit is contained in:
Micha Reiser 2023-03-03 19:29:49 +01:00 committed by GitHub
parent d1288dc2b1
commit cdbe2ee496
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
53 changed files with 842 additions and 331 deletions

View file

@ -0,0 +1,15 @@
[package]
name = "ruff_cache"
version = "0.0.0"
publish = false
edition = { workspace = true }
rust-version = { workspace = true }
[dependencies]
itertools = { workspace = true }
globset = { version = "0.4.9" }
regex = { workspace = true }
filetime = { version = "0.2.17" }
[dev-dependencies]
ruff_macros = { path = "../ruff_macros" }

View file

@ -0,0 +1,376 @@
use itertools::Itertools;
use regex::Regex;
use std::borrow::Cow;
use std::collections::hash_map::DefaultHasher;
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
use std::hash::{Hash, Hasher};
use std::ops::{Deref, DerefMut};
use std::path::{Path, PathBuf};
/// Hasher that accumulates the pieces of a cache key.
///
/// This is a thin wrapper around the standard library's [`DefaultHasher`]. It dereferences
/// (mutably and immutably) to the wrapped hasher, so every [`Hasher`] method such as
/// `write_u8` or `finish` can be called on it directly.
#[derive(Clone, Debug, Default)]
pub struct CacheKeyHasher {
    inner: DefaultHasher,
}

impl CacheKeyHasher {
    /// Creates a hasher that wraps a freshly constructed [`DefaultHasher`].
    pub fn new() -> Self {
        let inner = DefaultHasher::new();
        Self { inner }
    }
}

impl Deref for CacheKeyHasher {
    type Target = DefaultHasher;

    /// Borrows the wrapped hasher.
    fn deref(&self) -> &DefaultHasher {
        let Self { inner } = self;
        inner
    }
}

impl DerefMut for CacheKeyHasher {
    /// Mutably borrows the wrapped hasher so that data can be written into it.
    fn deref_mut(&mut self) -> &mut DefaultHasher {
        let Self { inner } = self;
        inner
    }
}
/// A type that can be used as part of a cache key.
///
/// A cache looks up artefacts by a cache key. Many cache keys are composed of sub-keys. For
/// example, caching the lint results of a file depends at least on the file content, the user
/// settings, and the linter version. Types implementing the [`CacheKey`] trait can be used as
/// part of a cache key by which artefacts are queried.
///
/// ## Implementing `CacheKey`
///
/// You can derive [`CacheKey`] with `#[derive(CacheKey)]` if all fields implement [`CacheKey`].
/// The resulting cache key will be the combination of the values from calling `cache_key` on
/// each field.
///
/// ```
/// # use ruff_macros::CacheKey;
///
/// #[derive(CacheKey)]
/// struct Test {
///     name: String,
///     version: u32,
/// }
/// ```
///
/// If you need more control over computing the cache key, you can of course implement
/// [`CacheKey`] yourself:
///
/// ```
/// use ruff_cache::{CacheKey, CacheKeyHasher};
///
/// struct Test {
///     name: String,
///     version: u32,
///     other: String
/// }
///
/// impl CacheKey for Test {
///     fn cache_key(&self, state: &mut CacheKeyHasher) {
///         // `other` does not contribute to the key.
///         self.name.cache_key(state);
///         self.version.cache_key(state);
///     }
/// }
/// ```
///
/// ## Portability
///
/// Ideally, the cache key is portable across platforms, but this is not yet a strict requirement.
///
/// ## Using [`Hash`]
///
/// You can defer to the [`Hash`] implementation for non-composite types.
/// Be aware that the [`Hash`] implementation may not be portable.
///
/// ## Why a new trait rather than reusing [`Hash`]?
///
/// The main reason is that hashes and cache keys have different constraints:
///
/// * Cache keys are less performance sensitive: hashes must be super fast to compute for
///   performant hashed collections. That's why some standard types don't implement [`Hash`]
///   where it would be safe to implement [`CacheKey`], e.g. `HashSet`.
/// * Cache keys must be deterministic, whereas hash keys do not have this constraint. That's why
///   pointers don't implement [`CacheKey`] but they implement [`Hash`].
/// * Ideally, cache keys are portable.
///
/// [`Hash`]: std::hash::Hash
pub trait CacheKey {
    /// Feeds this value into `state`.
    fn cache_key(&self, state: &mut CacheKeyHasher);

    /// Feeds every element of `data` into `state`, in slice order.
    ///
    /// The default implementation calls [`CacheKey::cache_key`] on each element and does not
    /// write the length of the slice.
    fn cache_key_slice(data: &[Self], state: &mut CacheKeyHasher)
    where
        Self: Sized,
    {
        data.iter().for_each(|element| element.cache_key(state));
    }
}
/// Keys a `bool` as a single byte: `0` for `false`, `1` for `true`.
impl CacheKey for bool {
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        let byte = u8::from(*self);
        state.write_u8(byte);
    }
}
/// Keys a `char` by its numeric value, written as a `u32`.
impl CacheKey for char {
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        // `u32::from(char)` is the lossless equivalent of `char as u32`.
        let scalar = u32::from(*self);
        state.write_u32(scalar);
    }
}
impl CacheKey for usize {
#[inline]
fn cache_key(&self, state: &mut CacheKeyHasher) {
state.write_usize(*self);
}
}
impl CacheKey for u128 {
#[inline]
fn cache_key(&self, state: &mut CacheKeyHasher) {
state.write_u128(*self);
}
}
impl CacheKey for u64 {
#[inline]
fn cache_key(&self, state: &mut CacheKeyHasher) {
state.write_u64(*self);
}
}
impl CacheKey for u32 {
#[inline]
fn cache_key(&self, state: &mut CacheKeyHasher) {
state.write_u32(*self);
}
}
impl CacheKey for u16 {
#[inline]
fn cache_key(&self, state: &mut CacheKeyHasher) {
state.write_u16(*self);
}
}
impl CacheKey for u8 {
#[inline]
fn cache_key(&self, state: &mut CacheKeyHasher) {
state.write_u8(*self);
}
}
impl CacheKey for isize {
#[inline]
fn cache_key(&self, state: &mut CacheKeyHasher) {
state.write_isize(*self);
}
}
impl CacheKey for i128 {
#[inline]
fn cache_key(&self, state: &mut CacheKeyHasher) {
state.write_i128(*self);
}
}
impl CacheKey for i64 {
#[inline]
fn cache_key(&self, state: &mut CacheKeyHasher) {
state.write_i64(*self);
}
}
impl CacheKey for i32 {
#[inline]
fn cache_key(&self, state: &mut CacheKeyHasher) {
state.write_i32(*self);
}
}
impl CacheKey for i16 {
#[inline]
fn cache_key(&self, state: &mut CacheKeyHasher) {
state.write_i16(*self);
}
}
impl CacheKey for i8 {
#[inline]
fn cache_key(&self, state: &mut CacheKeyHasher) {
state.write_i8(*self);
}
}
// Implements `CacheKey` for tuples.
//
// * Invoked with no arguments, it emits the impl for the unit type `()`, which writes nothing
//   into the hasher.
// * Invoked with one or more identifiers, it emits a generic impl for the tuple made of those
//   type parameters. Every parameter must implement `CacheKey`, and the fields are keyed in
//   declaration order.
macro_rules! impl_cache_key_tuple {
() => (
impl CacheKey for () {
#[inline]
fn cache_key(&self, _state: &mut CacheKeyHasher) {}
}
);
( $($name:ident)+) => (
// The `?Sized` relaxation is applied to the last type parameter only (see `last_type!`).
impl<$($name: CacheKey),+> CacheKey for ($($name,)+) where last_type!($($name,)+): ?Sized {
// The type parameter names (`T`, `B`, ...) are reused as the names of the local bindings
// below, hence the lint exemption.
#[allow(non_snake_case)]
#[inline]
fn cache_key(&self, state: &mut CacheKeyHasher) {
// Destructure the tuple by reference, binding one local per field.
let ($(ref $name,)+) = *self;
// Key each field in order.
$($name.cache_key(state);)+
}
}
);
}
// Expands to the last identifier of a comma-terminated identifier list by recursively dropping
// the head of the list until a single element remains.
macro_rules! last_type {
($a:ident,) => { $a };
($a:ident, $($rest_a:ident,)+) => { last_type!($($rest_a,)+) };
}
// Generate the impls for `()` and for tuples of 1 to 12 elements.
impl_cache_key_tuple! {}
impl_cache_key_tuple! { T }
impl_cache_key_tuple! { T B }
impl_cache_key_tuple! { T B C }
impl_cache_key_tuple! { T B C D }
impl_cache_key_tuple! { T B C D E }
impl_cache_key_tuple! { T B C D E F }
impl_cache_key_tuple! { T B C D E F G }
impl_cache_key_tuple! { T B C D E F G H }
impl_cache_key_tuple! { T B C D E F G H I }
impl_cache_key_tuple! { T B C D E F G H I J }
impl_cache_key_tuple! { T B C D E F G H I J K }
impl_cache_key_tuple! { T B C D E F G H I J K L }
impl CacheKey for str {
#[inline]
fn cache_key(&self, state: &mut CacheKeyHasher) {
self.hash(&mut **state);
}
}
/// Keys an owned string through the [`Hash`] implementation of its contents.
impl CacheKey for String {
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        // `String` hashes exactly like the `str` it holds, so hashing the slice is equivalent.
        Hash::hash(self.as_str(), &mut **state);
    }
}
/// Keys an `Option` as a discriminant (`0` for `None`, `1` for `Some`) followed, for `Some`,
/// by the key of the contained value.
impl<T: CacheKey> CacheKey for Option<T> {
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        if let Some(inner) = self {
            state.write_usize(1);
            inner.cache_key(state);
        } else {
            state.write_usize(0);
        }
    }
}
/// Keys a slice as its length followed by the keys of its elements in order.
impl<T: CacheKey> CacheKey for [T] {
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        let element_count = self.len();
        state.write_usize(element_count);
        T::cache_key_slice(self, state);
    }
}
/// A shared reference is keyed exactly like the value it points to.
impl<T: ?Sized + CacheKey> CacheKey for &T {
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        let referent: &T = *self;
        T::cache_key(referent, state);
    }
}
/// A mutable reference is keyed exactly like the value it points to.
impl<T: ?Sized + CacheKey> CacheKey for &mut T {
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        let referent: &T = &**self;
        T::cache_key(referent, state);
    }
}
/// Keys a vector as its length followed by the keys of its elements in order.
impl<T> CacheKey for Vec<T>
where
    T: CacheKey,
{
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        let elements: &[T] = self.as_slice();
        state.write_usize(elements.len());
        T::cache_key_slice(elements, state);
    }
}
/// Keys a hash map as its length followed by each entry's key and value.
///
/// Entries are visited in ascending key order rather than in the map's own iteration order,
/// so the resulting cache key does not depend on how the map happens to be laid out.
impl<K, V, S> CacheKey for HashMap<K, V, S>
where
    K: CacheKey + Ord,
    V: CacheKey,
{
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        state.write_usize(self.len());

        // Collect and stable-sort the entries by key.
        let mut entries: Vec<(&K, &V)> = self.iter().collect();
        entries.sort_by(|(left, _), (right, _)| left.cmp(right));

        for (key, value) in entries {
            key.cache_key(state);
            value.cache_key(state);
        }
    }
}
/// Keys a hash set as its length followed by its elements in ascending order.
///
/// Sorting makes the cache key independent of the set's own iteration order.
impl<V: CacheKey + Ord, S> CacheKey for HashSet<V, S> {
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        state.write_usize(self.len());

        // Collect and stable-sort the elements.
        let mut ordered: Vec<&V> = self.iter().collect();
        ordered.sort();

        for value in ordered {
            value.cache_key(state);
        }
    }
}
/// Keys a B-tree set as its length followed by its elements in the set's iteration order.
impl<V: CacheKey> CacheKey for BTreeSet<V> {
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        let element_count = self.len();
        state.write_usize(element_count);
        self.iter().for_each(|element| element.cache_key(state));
    }
}
/// Keys a B-tree map as its length followed by each entry's key and value, in the map's
/// iteration order.
impl<K: CacheKey + Ord, V: CacheKey> CacheKey for BTreeMap<K, V> {
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        let entry_count = self.len();
        state.write_usize(entry_count);
        self.iter().for_each(|(entry_key, entry_value)| {
            entry_key.cache_key(state);
            entry_value.cache_key(state);
        });
    }
}
impl CacheKey for Path {
#[inline]
fn cache_key(&self, state: &mut CacheKeyHasher) {
self.hash(&mut **state);
}
}
/// Keys an owned path through the [`Hash`] implementation of the path it holds.
impl CacheKey for PathBuf {
    #[inline]
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        // `PathBuf` hashes exactly like its borrowed `Path`, so hashing the borrow is equivalent.
        Hash::hash(self.as_path(), &mut **state);
    }
}
/// A `Cow` is keyed exactly like the value it dereferences to, whether borrowed or owned.
impl<V: ?Sized> CacheKey for Cow<'_, V>
where
    V: CacheKey + ToOwned,
{
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        // Deref coercion turns `&Cow<V>` into `&V`.
        let borrowed: &V = self;
        borrowed.cache_key(state);
    }
}
/// Keys a regular expression by the string returned from `Regex::as_str`.
impl CacheKey for Regex {
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        let pattern: &str = self.as_str();
        <str as CacheKey>::cache_key(pattern, state);
    }
}

View file

@ -0,0 +1,9 @@
use crate::{CacheKey, CacheKeyHasher};
use filetime::FileTime;
use std::hash::Hash;
/// Keys a `FileTime` through its [`Hash`] implementation.
impl CacheKey for FileTime {
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        // `&mut **state` reaches the hasher wrapped by `CacheKeyHasher`.
        Hash::hash(self, &mut **state);
    }
}

View file

@ -0,0 +1,14 @@
use crate::{CacheKey, CacheKeyHasher};
use globset::{Glob, GlobMatcher};
impl CacheKey for GlobMatcher {
fn cache_key(&self, state: &mut CacheKeyHasher) {
self.glob().cache_key(state);
}
}
/// Keys a glob by the pattern string returned from `Glob::glob`.
impl CacheKey for Glob {
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        let pattern: &str = self.glob();
        <str as CacheKey>::cache_key(pattern, state);
    }
}

View file

@ -0,0 +1,15 @@
mod cache_key;
pub mod filetime;
pub mod globset;
pub use cache_key::{CacheKey, CacheKeyHasher};
use std::path::{Path, PathBuf};
/// Name of the directory, relative to a project root, that holds the cache.
pub const CACHE_DIR_NAME: &str = ".ruff_cache";

/// Return the default cache directory for a given project root, i.e.
/// `<project_root>/.ruff_cache`.
///
/// This function only joins [`CACHE_DIR_NAME`] onto `project_root`. It does not read the
/// `RUFF_CACHE_DIR` environment variable (or any other part of the environment), so callers
/// that want to honour such an override have to apply it themselves.
pub fn cache_dir(project_root: &Path) -> PathBuf {
    project_root.join(CACHE_DIR_NAME)
}

View file

@ -0,0 +1,108 @@
use ruff_cache::{CacheKey, CacheKeyHasher};
use ruff_macros::CacheKey;
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
// Fixture types for the tests in this file. Each one derives both `CacheKey` and `Hash`, so a
// test can feed the same value through both and compare the resulting digests.

// A struct without any fields.
#[derive(CacheKey, Hash)]
struct UnitStruct;
// A struct with named fields.
#[derive(CacheKey, Hash)]
struct NamedFieldsStruct {
a: String,
b: String,
}
// A tuple struct (unnamed fields).
#[derive(CacheKey, Hash)]
struct UnnamedFieldsStruct(String, String);
// An enum with one variant of each shape: unit, tuple-like, and struct-like.
#[derive(CacheKey, Hash)]
enum Enum {
Unit,
UnnamedFields(String, String),
NamedFields { a: String, b: String },
}
/// The derived cache key of a unit struct equals the digest of its derived `Hash`.
#[test]
fn unit_struct_cache_key() {
    let mut expected = DefaultHasher::new();
    UnitStruct.hash(&mut expected);

    let mut actual = CacheKeyHasher::new();
    UnitStruct.cache_key(&mut actual);

    assert_eq!(expected.finish(), actual.finish());
}
/// The derived cache key of a struct with named fields equals the digest of its derived `Hash`.
#[test]
fn named_field_struct() {
    let subject = NamedFieldsStruct {
        a: String::from("Hello"),
        b: String::from("World"),
    };

    let mut expected = DefaultHasher::new();
    subject.hash(&mut expected);

    let mut actual = CacheKeyHasher::new();
    subject.cache_key(&mut actual);

    assert_eq!(expected.finish(), actual.finish());
}
/// The derived cache key of a tuple struct equals the digest of its derived `Hash`.
#[test]
fn unnamed_field_struct() {
    let subject = UnnamedFieldsStruct(String::from("Hello"), String::from("World"));

    let mut expected = DefaultHasher::new();
    subject.hash(&mut expected);

    let mut actual = CacheKeyHasher::new();
    subject.cache_key(&mut actual);

    assert_eq!(expected.finish(), actual.finish());
}
/// The derived cache key of a unit enum variant equals the digest of its derived `Hash`.
#[test]
fn enum_unit_variant() {
    let subject = Enum::Unit;

    let mut expected = DefaultHasher::new();
    subject.hash(&mut expected);

    let mut actual = CacheKeyHasher::new();
    subject.cache_key(&mut actual);

    assert_eq!(expected.finish(), actual.finish());
}
/// The derived cache key of a struct-like enum variant equals the digest of its derived `Hash`.
#[test]
fn enum_named_fields_variant() {
    let subject = Enum::NamedFields {
        a: String::from("Hello"),
        b: String::from("World"),
    };

    let mut expected = DefaultHasher::new();
    subject.hash(&mut expected);

    let mut actual = CacheKeyHasher::new();
    subject.cache_key(&mut actual);

    assert_eq!(expected.finish(), actual.finish());
}
/// The derived cache key of a tuple-like enum variant equals the digest of its derived `Hash`.
#[test]
fn enum_unnamed_fields_variant() {
    let subject = Enum::UnnamedFields(String::from("Hello"), String::from("World"));

    let mut expected = DefaultHasher::new();
    subject.hash(&mut expected);

    let mut actual = CacheKeyHasher::new();
    subject.cache_key(&mut actual);

    assert_eq!(expected.finish(), actual.finish());
}