Replace PyPI-internal Hashes representation with flat vector (#2925)

## Summary

Right now, we have a `Hashes` representation that looks like:

```rust
/// A dictionary mapping a hash name to a hex encoded digest of the file.
///
/// PEP 691 says multiple hashes can be included and the interpretation is left to the client.
#[derive(Debug, Clone, Eq, PartialEq, Default, Deserialize)]
pub struct Hashes {
    pub md5: Option<Box<str>>,
    pub sha256: Option<Box<str>>,
    pub sha384: Option<Box<str>>,
    pub sha512: Option<Box<str>>,
}
```

It stems from the PyPI API, which returns a dictionary of hashes.

We tend to pass these around as a `Vec<Hashes>`. But it's a
bit strange because each entry in that vector could contain multiple
hashes. And it makes it difficult to ask questions like "Is
`sha256:ab21378ca980a8` in the set of hashes?"

This PR instead treats `Hashes` as the PyPI-internal type, and uses a
new `Vec<HashDigest>` everywhere in our own APIs.
This commit is contained in:
Charlie Marsh 2024-04-09 12:56:16 -04:00 committed by GitHub
parent 1512e07a2e
commit 13ae5ac8dc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 195 additions and 103 deletions

View file

@ -7,7 +7,7 @@ use url::Url;
use pep440_rs::{VersionSpecifiers, VersionSpecifiersParseError};
use pep508_rs::split_scheme;
use pypi_types::{DistInfoMetadata, Hashes, Yanked};
use pypi_types::{DistInfoMetadata, HashDigest, Yanked};
/// Error converting [`pypi_types::File`] to [`distribution_type::File`].
#[derive(Debug, Error)]
@ -25,9 +25,9 @@ pub enum FileConversionError {
#[archive(check_bytes)]
#[archive_attr(derive(Debug))]
pub struct File {
pub dist_info_metadata: Option<DistInfoMetadata>,
pub dist_info_metadata: bool,
pub filename: String,
pub hashes: Hashes,
pub hashes: Vec<HashDigest>,
pub requires_python: Option<VersionSpecifiers>,
pub size: Option<u64>,
// N.B. We don't use a chrono DateTime<Utc> here because it's a little
@ -43,9 +43,12 @@ impl File {
/// `TryFrom` instead of `From` to filter out files with invalid requires python version specifiers
pub fn try_from(file: pypi_types::File, base: &Url) -> Result<Self, FileConversionError> {
Ok(Self {
dist_info_metadata: file.dist_info_metadata,
dist_info_metadata: file
.dist_info_metadata
.as_ref()
.is_some_and(DistInfoMetadata::is_available),
filename: file.filename,
hashes: file.hashes,
hashes: file.hashes.into_digests(),
requires_python: file
.requires_python
.transpose()

View file

@ -774,16 +774,16 @@ impl Identifier for Url {
impl Identifier for File {
fn distribution_id(&self) -> DistributionId {
if let Some(hash) = self.hashes.as_str() {
DistributionId::new(hash)
if let Some(hash) = self.hashes.first() {
DistributionId::new(&*hash.digest)
} else {
self.url.distribution_id()
}
}
fn resource_id(&self) -> ResourceId {
if let Some(hash) = self.hashes.as_str() {
ResourceId::new(hash)
if let Some(hash) = self.hashes.first() {
ResourceId::new(&*hash.digest)
} else {
self.url.resource_id()
}

View file

@ -2,7 +2,7 @@ use std::fmt::{Display, Formatter};
use pep440_rs::VersionSpecifiers;
use platform_tags::{IncompatibleTag, TagCompatibility, TagPriority};
use pypi_types::{Hashes, Yanked};
use pypi_types::{HashDigest, Yanked};
use crate::{Dist, InstalledDist, ResolvedDistRef};
@ -18,7 +18,7 @@ struct PrioritizedDistInner {
/// The highest-priority wheel.
wheel: Option<(Dist, WheelCompatibility)>,
/// The hashes for each distribution.
hashes: Vec<Hashes>,
hashes: Vec<HashDigest>,
}
/// A distribution that can be used for both resolution and installation.
@ -141,24 +141,28 @@ pub enum IncompatibleSource {
impl PrioritizedDist {
/// Create a new [`PrioritizedDist`] from the given wheel distribution.
pub fn from_built(dist: Dist, hash: Option<Hashes>, compatibility: WheelCompatibility) -> Self {
pub fn from_built(
dist: Dist,
hashes: Vec<HashDigest>,
compatibility: WheelCompatibility,
) -> Self {
Self(Box::new(PrioritizedDistInner {
wheel: Some((dist, compatibility)),
source: None,
hashes: hash.map(|hash| vec![hash]).unwrap_or_default(),
hashes,
}))
}
/// Create a new [`PrioritizedDist`] from the given source distribution.
pub fn from_source(
dist: Dist,
hash: Option<Hashes>,
hashes: Vec<HashDigest>,
compatibility: SourceDistCompatibility,
) -> Self {
Self(Box::new(PrioritizedDistInner {
wheel: None,
source: Some((dist, compatibility)),
hashes: hash.map(|hash| vec![hash]).unwrap_or_default(),
hashes,
}))
}
@ -166,7 +170,7 @@ impl PrioritizedDist {
pub fn insert_built(
&mut self,
dist: Dist,
hash: Option<Hashes>,
hashes: Vec<HashDigest>,
compatibility: WheelCompatibility,
) {
// Track the highest-priority wheel.
@ -178,16 +182,14 @@ impl PrioritizedDist {
self.0.wheel = Some((dist, compatibility));
}
if let Some(hash) = hash {
self.0.hashes.push(hash);
}
self.0.hashes.extend(hashes);
}
/// Insert the given source distribution into the [`PrioritizedDist`].
pub fn insert_source(
&mut self,
dist: Dist,
hash: Option<Hashes>,
hashes: Vec<HashDigest>,
compatibility: SourceDistCompatibility,
) {
// Track the highest-priority source.
@ -199,9 +201,7 @@ impl PrioritizedDist {
self.0.source = Some((dist, compatibility));
}
if let Some(hash) = hash {
self.0.hashes.push(hash);
}
self.0.hashes.extend(hashes);
}
/// Return the highest-priority distribution for the package version, if any.
@ -274,7 +274,7 @@ impl PrioritizedDist {
}
/// Return the hashes for each distribution.
pub fn hashes(&self) -> &[Hashes] {
pub fn hashes(&self) -> &[HashDigest] {
&self.0.hashes
}

View file

@ -68,11 +68,7 @@ where
))
}
#[derive(
Debug, Clone, Serialize, Deserialize, rkyv::Archive, rkyv::Deserialize, rkyv::Serialize,
)]
#[archive(check_bytes)]
#[archive_attr(derive(Debug))]
#[derive(Debug, Clone, Deserialize)]
#[serde(untagged)]
pub enum DistInfoMetadata {
Bool(bool),
@ -125,23 +121,7 @@ impl Default for Yanked {
/// A dictionary mapping a hash name to a hex encoded digest of the file.
///
/// PEP 691 says multiple hashes can be included and the interpretation is left to the client.
#[derive(
Debug,
Clone,
Ord,
PartialOrd,
Eq,
PartialEq,
Hash,
Default,
Serialize,
Deserialize,
rkyv::Archive,
rkyv::Deserialize,
rkyv::Serialize,
)]
#[archive(check_bytes)]
#[archive_attr(derive(Debug))]
#[derive(Debug, Clone, Eq, PartialEq, Default, Deserialize)]
pub struct Hashes {
pub md5: Option<Box<str>>,
pub sha256: Option<Box<str>>,
@ -150,31 +130,34 @@ pub struct Hashes {
}
impl Hashes {
/// Format as `<algorithm>:<hash>`.
pub fn to_string(&self) -> Option<String> {
self.sha512
.as_ref()
.map(|sha512| format!("sha512:{sha512}"))
.or_else(|| {
self.sha384
.as_ref()
.map(|sha384| format!("sha384:{sha384}"))
})
.or_else(|| {
self.sha256
.as_ref()
.map(|sha256| format!("sha256:{sha256}"))
})
.or_else(|| self.md5.as_ref().map(|md5| format!("md5:{md5}")))
}
/// Return the hash digest.
pub fn as_str(&self) -> Option<&str> {
self.sha512
.as_deref()
.or(self.sha384.as_deref())
.or(self.sha256.as_deref())
.or(self.md5.as_deref())
/// Convert a set of [`Hashes`] into a list of [`HashDigest`]s.
pub fn into_digests(self) -> Vec<HashDigest> {
let mut digests = Vec::new();
if let Some(sha512) = self.sha512 {
digests.push(HashDigest {
algorithm: HashAlgorithm::Sha512,
digest: sha512,
});
}
if let Some(sha384) = self.sha384 {
digests.push(HashDigest {
algorithm: HashAlgorithm::Sha384,
digest: sha384,
});
}
if let Some(sha256) = self.sha256 {
digests.push(HashDigest {
algorithm: HashAlgorithm::Sha256,
digest: sha256,
});
}
if let Some(md5) = self.md5 {
digests.push(HashDigest {
algorithm: HashAlgorithm::Md5,
digest: md5,
});
}
digests
}
}
@ -239,6 +222,118 @@ impl FromStr for Hashes {
}
}
/// A hash algorithm supported for distribution-file digests.
///
/// NOTE: the derived `Ord`/`PartialOrd` follow declaration order
/// (`Md5 < Sha256 < Sha384 < Sha512`), so do not reorder variants.
#[derive(
Debug,
Clone,
Copy,
Ord,
PartialOrd,
Eq,
PartialEq,
Hash,
Serialize,
Deserialize,
rkyv::Archive,
rkyv::Deserialize,
rkyv::Serialize,
)]
#[archive(check_bytes)]
#[archive_attr(derive(Debug))]
pub enum HashAlgorithm {
/// MD5 (128-bit digest).
Md5,
/// SHA-256 (256-bit digest).
Sha256,
/// SHA-384 (384-bit digest).
Sha384,
/// SHA-512 (512-bit digest).
Sha512,
}
impl FromStr for HashAlgorithm {
    type Err = HashError;

    /// Parse a lowercase algorithm name (e.g., `"sha256"`) into a
    /// [`HashAlgorithm`].
    ///
    /// # Errors
    ///
    /// Returns [`HashError::UnsupportedHashAlgorithm`] for any name other
    /// than `md5`, `sha256`, `sha384`, or `sha512`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let algorithm = match s {
            "md5" => Self::Md5,
            "sha256" => Self::Sha256,
            "sha384" => Self::Sha384,
            "sha512" => Self::Sha512,
            // Unknown names are rejected rather than silently ignored.
            other => return Err(HashError::UnsupportedHashAlgorithm(other.to_string())),
        };
        Ok(algorithm)
    }
}
impl std::fmt::Display for HashAlgorithm {
    /// Write the canonical lowercase name of the algorithm, matching the
    /// names accepted by [`HashAlgorithm::from_str`].
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = match self {
            Self::Md5 => "md5",
            Self::Sha256 => "sha256",
            Self::Sha384 => "sha384",
            Self::Sha512 => "sha512",
        };
        f.write_str(name)
    }
}
/// A hash algorithm paired with a hex-encoded digest of a file.
///
/// Displays and parses as `<algorithm>:<digest>` (e.g., `sha256:ab12…`).
#[derive(
Debug,
Clone,
Ord,
PartialOrd,
Eq,
PartialEq,
Hash,
Serialize,
Deserialize,
rkyv::Archive,
rkyv::Deserialize,
rkyv::Serialize,
)]
#[archive(check_bytes)]
#[archive_attr(derive(Debug))]
pub struct HashDigest {
/// The algorithm that produced the digest.
pub algorithm: HashAlgorithm,
/// The hex-encoded digest value. Boxed to drop the capacity word of a
/// `String`; presumably lowercase hex — TODO confirm at call sites.
pub digest: Box<str>,
}
impl HashDigest {
    /// Return the [`HashAlgorithm`] of the digest.
    pub fn algorithm(&self) -> HashAlgorithm {
        self.algorithm
    }

    /// Return the hex-encoded digest value.
    ///
    /// Added for API symmetry with [`HashDigest::algorithm`]; returns a
    /// borrow so callers don't clone the boxed string.
    pub fn digest(&self) -> &str {
        &self.digest
    }
}
impl std::fmt::Display for HashDigest {
    /// Format as `<algorithm>:<digest>`, the same shape accepted by
    /// [`HashDigest::from_str`].
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Fully-qualified call avoids ambiguity with `Debug::fmt`.
        std::fmt::Display::fmt(&self.algorithm, f)?;
        f.write_str(":")?;
        f.write_str(&self.digest)
    }
}
impl FromStr for HashDigest {
    type Err = HashError;

    /// Parse a digest of the form `<algorithm>:<hex>` (e.g., `sha256:ab12…`).
    ///
    /// # Errors
    ///
    /// Returns [`HashError::InvalidStructure`] if the string does not contain
    /// exactly one `:` separator, and propagates
    /// [`HashError::UnsupportedHashAlgorithm`] for unrecognized algorithm
    /// names.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // `split_once` yields the text before and after the *first* colon,
        // replacing the manual `split(':')` + repeated `next()` dance.
        let (name, value) = s
            .split_once(':')
            .ok_or_else(|| HashError::InvalidStructure(s.to_string()))?;

        // Preserve the original contract: more than one `:` is malformed.
        if value.contains(':') {
            return Err(HashError::InvalidStructure(s.to_string()));
        }

        let algorithm = HashAlgorithm::from_str(name)?;

        Ok(HashDigest {
            algorithm,
            digest: value.into(),
        })
    }
}
#[derive(thiserror::Error, Debug)]
pub enum HashError {
#[error("Unexpected hash (expected `<algorithm>:<hash>`): {0}")]

View file

@ -598,7 +598,7 @@ impl CacheBucket {
Self::FlatIndex => "flat-index-v0",
Self::Git => "git-v0",
Self::Interpreter => "interpreter-v0",
Self::Simple => "simple-v6",
Self::Simple => "simple-v7",
Self::Wheels => "wheels-v0",
Self::Archive => "archive-v0",
}

View file

@ -17,7 +17,7 @@ use distribution_types::{
use pep440_rs::Version;
use pep508_rs::VerbatimUrl;
use platform_tags::Tags;
use pypi_types::Hashes;
use uv_cache::{Cache, CacheBucket};
use uv_configuration::{NoBinary, NoBuild};
use uv_normalize::PackageName;
@ -236,9 +236,9 @@ impl<'a> FlatIndexClient<'a> {
};
let file = File {
dist_info_metadata: None,
dist_info_metadata: false,
filename: filename.to_string(),
hashes: Hashes::default(),
hashes: Vec::new(),
requires_python: None,
size: None,
upload_time_utc_ms: None,
@ -323,10 +323,10 @@ impl FlatIndex {
}));
match distributions.0.entry(version) {
Entry::Occupied(mut entry) => {
entry.get_mut().insert_built(dist, None, compatibility);
entry.get_mut().insert_built(dist, vec![], compatibility);
}
Entry::Vacant(entry) => {
entry.insert(PrioritizedDist::from_built(dist, None, compatibility));
entry.insert(PrioritizedDist::from_built(dist, vec![], compatibility));
}
}
}
@ -339,10 +339,10 @@ impl FlatIndex {
}));
match distributions.0.entry(filename.version) {
Entry::Occupied(mut entry) => {
entry.get_mut().insert_source(dist, None, compatibility);
entry.get_mut().insert_source(dist, vec![], compatibility);
}
Entry::Vacant(entry) => {
entry.insert(PrioritizedDist::from_source(dist, None, compatibility));
entry.insert(PrioritizedDist::from_source(dist, vec![], compatibility));
}
}
}

View file

@ -424,11 +424,7 @@ impl RegistryClient {
) -> Result<Metadata23, Error> {
// If the metadata file is available at its own url (PEP 658), download it from there.
let filename = WheelFilename::from_str(&file.filename).map_err(ErrorKind::WheelFilename)?;
if file
.dist_info_metadata
.as_ref()
.is_some_and(pypi_types::DistInfoMetadata::is_available)
{
if file.dist_info_metadata {
let mut url = url.clone();
url.set_path(&format!("{}.metadata", url.path()));

View file

@ -6,7 +6,7 @@ use pep440_rs::{Operator, Version};
use pep508_rs::{
MarkerEnvironment, Requirement, RequirementsTxtRequirement, UnnamedRequirement, VersionOrUrl,
};
use pypi_types::{HashError, Hashes};
use pypi_types::{HashDigest, HashError};
use requirements_txt::RequirementEntry;
use tracing::trace;
use uv_normalize::PackageName;
@ -23,7 +23,7 @@ pub enum PreferenceError {
#[derive(Clone, Debug)]
pub struct Preference {
requirement: Requirement,
hashes: Vec<Hashes>,
hashes: Vec<HashDigest>,
}
impl Preference {
@ -40,7 +40,7 @@ impl Preference {
.hashes
.iter()
.map(String::as_str)
.map(Hashes::from_str)
.map(HashDigest::from_str)
.collect::<Result<_, _>>()?,
})
}
@ -146,7 +146,7 @@ impl Preferences {
&self,
package_name: &PackageName,
version: &Version,
) -> Option<&[Hashes]> {
) -> Option<&[HashDigest]> {
self.0
.get(package_name)
.filter(|pin| pin.version() == version)
@ -158,7 +158,7 @@ impl Preferences {
#[derive(Debug, Clone)]
struct Pin {
version: Version,
hashes: Vec<Hashes>,
hashes: Vec<HashDigest>,
}
impl Pin {
@ -168,7 +168,7 @@ impl Pin {
}
/// Return the hashes of the pinned package.
fn hashes(&self) -> &[Hashes] {
fn hashes(&self) -> &[HashDigest] {
&self.hashes
}
}

View file

@ -18,7 +18,7 @@ use distribution_types::{
use once_map::OnceMap;
use pep440_rs::Version;
use pep508_rs::MarkerEnvironment;
use pypi_types::Hashes;
use pypi_types::HashDigest;
use uv_distribution::to_precise;
use uv_normalize::{ExtraName, PackageName};
@ -50,7 +50,7 @@ pub struct ResolutionGraph {
/// The underlying graph.
petgraph: petgraph::graph::Graph<ResolvedDist, Range<Version>, petgraph::Directed>,
/// The metadata for every distribution in this resolution.
hashes: FxHashMap<PackageName, Vec<Hashes>>,
hashes: FxHashMap<PackageName, Vec<HashDigest>>,
/// The enabled extras for every distribution in this resolution.
extras: FxHashMap<PackageName, Vec<ExtraName>>,
/// The set of editable requirements in this resolution.
@ -649,12 +649,10 @@ impl std::fmt::Display for DisplayResolutionGraph<'_> {
.filter(|hashes| !hashes.is_empty())
{
for hash in hashes {
if let Some(hash) = hash.to_string() {
has_hashes = true;
line.push_str(" \\\n");
line.push_str(" --hash=");
line.push_str(&hash);
}
has_hashes = true;
line.push_str(" \\\n");
line.push_str(" --hash=");
line.push_str(&hash.to_string());
}
}
}

View file

@ -12,7 +12,7 @@ use distribution_types::{
};
use pep440_rs::{Version, VersionSpecifiers};
use platform_tags::Tags;
use pypi_types::{Hashes, Yanked};
use pypi_types::{HashDigest, Yanked};
use rkyv::{de::deserializers::SharedDeserializeMap, Deserialize};
use uv_client::{FlatDistributions, OwnedArchive, SimpleMetadata, VersionFiles};
use uv_configuration::{NoBinary, NoBuild};
@ -176,7 +176,7 @@ impl VersionMap {
}
/// Return the [`Hashes`] for the given version, if any.
pub(crate) fn hashes(&self, version: &Version) -> Option<Vec<Hashes>> {
pub(crate) fn hashes(&self, version: &Version) -> Option<Vec<HashDigest>> {
match self.inner {
VersionMapInner::Eager(ref map) => map.get(version).map(|file| file.hashes().to_vec()),
VersionMapInner::Lazy(ref lazy) => lazy.get(version).map(|file| file.hashes().to_vec()),
@ -378,7 +378,7 @@ impl VersionMapLazy {
let version = filename.version().clone();
let requires_python = file.requires_python.clone();
let yanked = file.yanked.clone();
let hash = file.hashes.clone();
let hashes = file.hashes.clone();
match filename {
DistFilename::WheelFilename(filename) => {
let compatibility = self.wheel_compatibility(
@ -394,7 +394,7 @@ impl VersionMapLazy {
file,
self.index.clone(),
);
priority_dist.insert_built(dist, Some(hash), compatibility);
priority_dist.insert_built(dist, hashes, compatibility);
}
DistFilename::SourceDistFilename(filename) => {
let compatibility = self.source_dist_compatibility(
@ -409,7 +409,7 @@ impl VersionMapLazy {
file,
self.index.clone(),
);
priority_dist.insert_source(dist, Some(hash), compatibility);
priority_dist.insert_source(dist, hashes, compatibility);
}
}
}