From ef7be9103c691565739e3da5ba2c0725f41861c2 Mon Sep 17 00:00:00 2001 From: Zanie Blue Date: Thu, 7 Dec 2023 11:04:47 -0600 Subject: [PATCH] Parse `SimpleJson` into categorized data in the client (#522) Extends #517 with a suggestion from @konstin to parse the `SimpleJson` into an intermediate type `SimpleMetadata(BTreeMap)` before converting to a `VersionMap`. This reduces the number of times we need to parse the response. Additionally, we cache the parsed response now instead of `SimpleJson`. `VersionFiles` stores two vectors with `WheelFilename`/`SourceDistFilename` and `File` tuples. These can be iterated over together or separately. A new enum `DistFilename` was added to capture the `SourceDistFilename` and `WheelFilename` variants allowing iteration over both vectors. --- Cargo.lock | 1 + crates/distribution-filename/src/lib.rs | 23 +++ .../distribution-filename/src/source_dist.rs | 5 +- crates/puffin-cache/src/lib.rs | 2 +- crates/puffin-client/Cargo.toml | 1 + crates/puffin-client/src/lib.rs | 2 +- crates/puffin-client/src/registry_client.rs | 87 +++++++++++- crates/puffin-resolver/src/finder.rs | 110 ++++++++------- crates/puffin-resolver/src/resolver.rs | 1 + crates/puffin-resolver/src/version_map.rs | 133 +++++++++--------- 10 files changed, 233 insertions(+), 132 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9eb25f967..33f31b1b6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2365,6 +2365,7 @@ dependencies = [ "http", "http-cache-semantics", "install-wheel-rs", + "pep440_rs 0.3.12", "puffin-cache", "puffin-fs", "puffin-normalize", diff --git a/crates/distribution-filename/src/lib.rs b/crates/distribution-filename/src/lib.rs index 60fcbbf77..7e4072f14 100644 --- a/crates/distribution-filename/src/lib.rs +++ b/crates/distribution-filename/src/lib.rs @@ -1,5 +1,28 @@ +use std::str::FromStr; + pub use source_dist::{SourceDistExtension, SourceDistFilename, SourceDistFilenameError}; pub use wheel::{WheelFilename, WheelFilenameError}; mod source_dist; mod wheel; + +#[derive(Debug, Clone)] +pub enum DistFilename { + SourceDistFilename(SourceDistFilename), + WheelFilename(WheelFilename), +} + +impl DistFilename { + pub fn try_from_filename( + filename: &str, + package_name: &puffin_normalize::PackageName, + ) -> Option { + if let Ok(filename) = WheelFilename::from_str(filename) { + Some(Self::WheelFilename(filename)) + } else if let Ok(filename) = SourceDistFilename::parse(filename, package_name) { + Some(Self::SourceDistFilename(filename)) + } else { + None + } + } +} diff --git a/crates/distribution-filename/src/source_dist.rs b/crates/distribution-filename/src/source_dist.rs index 37f846dcf..10ffb5927 100644 --- a/crates/distribution-filename/src/source_dist.rs +++ b/crates/distribution-filename/src/source_dist.rs @@ -1,12 +1,14 @@ use std::fmt::{Display, Formatter}; use std::str::FromStr; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use thiserror::Error; use pep440_rs::Version; use puffin_normalize::{InvalidNameError, PackageName}; -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub enum SourceDistExtension { Zip, TarGz, @@ -48,6 +50,7 @@ impl SourceDistExtension { /// Note that this is a normalized and not an exact representation, keep the original string if you /// need the latter. #[derive(Clone, Debug, PartialEq, Eq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct SourceDistFilename { pub name: PackageName, pub version: Version, diff --git a/crates/puffin-cache/src/lib.rs b/crates/puffin-cache/src/lib.rs index 196d50c82..4028163b9 100644 --- a/crates/puffin-cache/src/lib.rs +++ b/crates/puffin-cache/src/lib.rs @@ -370,7 +370,7 @@ pub enum CacheBucket { /// * `simple-v0/pypi/.json` /// * `simple-v0//.json` /// - /// TODO(konstin): Link to the json type after + /// The response is parsed into [`puffin_client::SimpleMetadata`] before storage. Simple, } diff --git a/crates/puffin-client/Cargo.toml b/crates/puffin-client/Cargo.toml index 111ea4c72..c8ffe7212 100644 --- a/crates/puffin-client/Cargo.toml +++ b/crates/puffin-client/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" distribution-filename = { path = "../distribution-filename" } distribution-types = { path = "../distribution-types" } install-wheel-rs = { path = "../install-wheel-rs" } +pep440_rs = { path = "../pep440-rs" } puffin-cache = { path = "../puffin-cache" } puffin-fs = { path = "../puffin-fs" } puffin-normalize = { path = "../puffin-normalize" } diff --git a/crates/puffin-client/src/lib.rs b/crates/puffin-client/src/lib.rs index d9655f3fe..27d5e05bf 100644 --- a/crates/puffin-client/src/lib.rs +++ b/crates/puffin-client/src/lib.rs @@ -1,6 +1,6 @@ pub use cached_client::{CachedClient, CachedClientError, DataWithCachePolicy}; pub use error::Error; -pub use registry_client::{RegistryClient, RegistryClientBuilder}; +pub use registry_client::{RegistryClient, RegistryClientBuilder, SimpleMetadata}; mod cached_client; mod error; diff --git a/crates/puffin-client/src/registry_client.rs b/crates/puffin-client/src/registry_client.rs index cbd744fc9..348df4057 100644 --- a/crates/puffin-client/src/registry_client.rs +++ b/crates/puffin-client/src/registry_client.rs @@ -1,3 +1,4 @@ +use std::collections::BTreeMap; use std::fmt::Debug; use std::path::Path; use std::str::FromStr; @@ -8,15 +9,17 @@ use futures::TryStreamExt; use reqwest::{Client, ClientBuilder, Response, StatusCode}; use reqwest_retry::policies::ExponentialBackoff; use reqwest_retry::RetryTransientMiddleware; +use serde::{Deserialize, Serialize}; use tempfile::tempfile_in; use tokio::io::BufWriter; use tokio_util::compat::FuturesAsyncReadCompatExt; use tracing::{debug, trace}; use url::Url; -use distribution_filename::WheelFilename; +use distribution_filename::{DistFilename, SourceDistFilename, WheelFilename}; use distribution_types::{BuiltDist, Metadata}; use install_wheel_rs::find_dist_info; +use pep440_rs::Version; use puffin_cache::{digest, Cache, CacheBucket, CanonicalUrl, WheelCache}; use puffin_normalize::PackageName; use pypi_types::{File, IndexUrl, IndexUrls, Metadata21, SimpleJson}; @@ -122,7 +125,7 @@ impl RegistryClient { pub async fn simple( &self, package_name: &PackageName, - ) -> Result<(IndexUrl, SimpleJson), Error> { + ) -> Result<(IndexUrl, SimpleMetadata), Error> { if self.index_urls.no_index() { return Err(Error::NoIndex(package_name.as_ref().to_string())); } @@ -155,7 +158,8 @@ impl RegistryClient { let bytes = response.bytes().await?; let data: SimpleJson = serde_json::from_slice(bytes.as_ref()) .map_err(|err| Error::from_json_err(err, url))?; - Ok(data) + let metadata = SimpleMetadata::from_package_simple_json(package_name, data); + Ok(metadata) }; let result = self .client @@ -164,8 +168,8 @@ impl RegistryClient { // Fetch from the index. match result { - Ok(simple_json) => { - return Ok((index.clone(), simple_json)); + Ok(simple_metadata) => { + return Ok((index.clone(), simple_metadata)); } Err(CachedClientError::Client(Error::RequestError(err))) => { if err.status() == Some(StatusCode::NOT_FOUND) { @@ -384,3 +388,76 @@ impl RegistryClient { )) } } + +#[derive(Default, Debug, Serialize, Deserialize)] +pub struct VersionFiles { + pub wheels: Vec<(WheelFilename, File)>, + pub source_dists: Vec<(SourceDistFilename, File)>, +} + +impl VersionFiles { + fn push(&mut self, filename: DistFilename, file: File) { + match filename { + DistFilename::WheelFilename(inner) => self.wheels.push((inner, file)), + DistFilename::SourceDistFilename(inner) => self.source_dists.push((inner, file)), + } + } + + pub fn all(self) -> impl Iterator { + self.wheels + .into_iter() + .map(|(filename, file)| (DistFilename::WheelFilename(filename), file)) + .chain( + self.source_dists + .into_iter() + .map(|(filename, file)| (DistFilename::SourceDistFilename(filename), file)), + ) + } +} + +#[derive(Default, Debug, Serialize, Deserialize)] +pub struct SimpleMetadata(BTreeMap); + +impl SimpleMetadata { + pub fn iter(&self) -> impl DoubleEndedIterator { + self.0.iter() + } + + fn from_package_simple_json(package_name: &PackageName, simple_json: SimpleJson) -> Self { + let mut metadata = Self::default(); + + // Group the distributions by version and kind + for file in simple_json.files { + if let Some(filename) = + DistFilename::try_from_filename(file.filename.as_str(), package_name) + { + let version = match filename { + DistFilename::SourceDistFilename(ref inner) => &inner.version, + DistFilename::WheelFilename(ref inner) => &inner.version, + }; + + match metadata.0.entry(version.clone()) { + std::collections::btree_map::Entry::Occupied(mut entry) => { + entry.get_mut().push(filename, file); + } + std::collections::btree_map::Entry::Vacant(entry) => { + let mut files = VersionFiles::default(); + files.push(filename, file); + entry.insert(files); + } + } + } + } + + metadata + } +} + +impl IntoIterator for SimpleMetadata { + type Item = (Version, VersionFiles); + type IntoIter = std::collections::btree_map::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.0.into_iter() + } +} diff --git a/crates/puffin-resolver/src/finder.rs b/crates/puffin-resolver/src/finder.rs index 20b4209ff..26914c447 100644 --- a/crates/puffin-resolver/src/finder.rs +++ b/crates/puffin-resolver/src/finder.rs @@ -3,21 +3,19 @@ //! This is similar to running `pip install` with the `--no-deps` flag. use std::hash::BuildHasherDefault; -use std::str::FromStr; use anyhow::Result; use futures::StreamExt; use fxhash::FxHashMap; -use distribution_filename::{SourceDistFilename, WheelFilename}; use distribution_types::Dist; use pep440_rs::Version; use pep508_rs::{Requirement, VersionOrUrl}; use platform_tags::{TagPriority, Tags}; -use puffin_client::RegistryClient; +use puffin_client::{RegistryClient, SimpleMetadata}; use puffin_interpreter::Interpreter; use puffin_normalize::PackageName; -use pypi_types::{File, IndexUrl, SimpleJson}; +use pypi_types::IndexUrl; use crate::error::ResolveError; use crate::resolution::Resolution; @@ -108,8 +106,7 @@ impl<'a> DistFinder<'a> { match result { Response::Package(requirement, index, metadata) => { // Pick a version that satisfies the requirement. - let Some(distribution) = self.select(&requirement, &index, metadata.files) - else { + let Some(distribution) = self.select(&requirement, &index, metadata) else { return Err(ResolveError::NotFound(requirement)); }; @@ -141,77 +138,82 @@ impl<'a> DistFinder<'a> { &self, requirement: &Requirement, index: &IndexUrl, - files: Vec, + metadata: SimpleMetadata, ) -> Option { let mut best_version: Option = None; let mut best_wheel: Option<(Dist, TagPriority)> = None; let mut best_sdist: Option = None; - for file in files.into_iter().rev() { - // Only add dists compatible with the python version. - // This is relevant for source dists which give no other indication of their - // compatibility and wheels which may be tagged `py3-none-any` but - // have `requires-python: ">=3.9"` - // TODO(konstin): https://github.com/astral-sh/puffin/issues/406 - if file - .requires_python + for (version, files) in metadata.into_iter().rev() { + // If we iterated past the first-compatible version, break. + if best_version .as_ref() - .is_some_and(|requires_python| { - !requires_python.contains(self.interpreter.version()) - }) + .is_some_and(|best_version| *best_version != version) { + break; + } + + // If the version does not satisfy the requirement, continue. + if !requirement.is_satisfied_by(&version) { continue; } - // Find the most-compatible wheel. - if let Ok(wheel) = WheelFilename::from_str(file.filename.as_str()) { - // If we iterated past the first-compatible version, break. - if best_version + // Find the most-compatible wheel + for (wheel, file) in files.wheels { + // Only add dists compatible with the python version. + // This is relevant for source dists which give no other indication of their + // compatibility and wheels which may be tagged `py3-none-any` but + // have `requires-python: ">=3.9"` + // TODO(konstin): https://github.com/astral-sh/puffin/issues/406 + if !file + .requires_python .as_ref() - .is_some_and(|version| *version != wheel.version) + .map_or(true, |requires_python| { + requires_python.contains(self.interpreter.version()) + }) { - break; + continue; } - if requirement.is_satisfied_by(&wheel.version) { - best_version = Some(wheel.version.clone()); - if let Some(priority) = wheel.compatibility(self.tags) { - if best_wheel - .as_ref() - .map_or(true, |(.., existing)| priority > *existing) - { - best_wheel = Some(( - Dist::from_registry(wheel.name, wheel.version, file, index.clone()), - priority, - )); - } + best_version = Some(version.clone()); + if let Some(priority) = wheel.compatibility(self.tags) { + if best_wheel + .as_ref() + .map_or(true, |(.., existing)| priority > *existing) + { + best_wheel = Some(( + Dist::from_registry(wheel.name, wheel.version, file, index.clone()), + priority, + )); } } - continue; } // Find the most-compatible sdist, if no wheel was found. if best_wheel.is_none() { - if let Ok(sdist) = - SourceDistFilename::parse(file.filename.as_str(), &requirement.name) - { - // If we iterated past the first-compatible version, break. - if best_version + for (sdist, file) in files.source_dists { + // Only add dists compatible with the python version. + // This is relevant for source dists which give no other indication of their + // compatibility and wheels which may be tagged `py3-none-any` but + // have `requires-python: ">=3.9"` + // TODO(konstin): https://github.com/astral-sh/puffin/issues/406 + if !file + .requires_python .as_ref() - .is_some_and(|version| *version != sdist.version) + .map_or(true, |requires_python| { + requires_python.contains(self.interpreter.version()) + }) { - break; + continue; } - if requirement.is_satisfied_by(&sdist.version) { - best_version = Some(sdist.version.clone()); - best_sdist = Some(Dist::from_registry( - sdist.name, - sdist.version, - file, - index.clone(), - )); - } + best_version = Some(sdist.version.clone()); + best_sdist = Some(Dist::from_registry( + sdist.name, + sdist.version, + file, + index.clone(), + )); } } } @@ -229,7 +231,7 @@ enum Request { #[derive(Debug)] enum Response { /// The returned metadata for a package. - Package(Requirement, IndexUrl, SimpleJson), + Package(Requirement, IndexUrl, SimpleMetadata), } pub trait Reporter: Send + Sync { diff --git a/crates/puffin-resolver/src/resolver.rs b/crates/puffin-resolver/src/resolver.rs index 9492141f2..f3d5d6e0e 100644 --- a/crates/puffin-resolver/src/resolver.rs +++ b/crates/puffin-resolver/src/resolver.rs @@ -9,6 +9,7 @@ use chrono::{DateTime, Utc}; use futures::channel::mpsc::UnboundedReceiver; use futures::{pin_mut, FutureExt, StreamExt, TryFutureExt}; use fxhash::{FxHashMap, FxHashSet}; + use pubgrub::error::PubGrubError; use pubgrub::range::Range; use pubgrub::solver::{Incompatibility, State}; diff --git a/crates/puffin-resolver/src/version_map.rs b/crates/puffin-resolver/src/version_map.rs index f7a7fc5fe..58ff0fb3a 100644 --- a/crates/puffin-resolver/src/version_map.rs +++ b/crates/puffin-resolver/src/version_map.rs @@ -1,21 +1,21 @@ use std::collections::btree_map::Entry; use std::collections::BTreeMap; -use std::str::FromStr; use chrono::{DateTime, Utc}; use tracing::warn; -use distribution_filename::{SourceDistFilename, WheelFilename}; +use distribution_filename::DistFilename; use pep508_rs::MarkerEnvironment; use platform_tags::{TagPriority, Tags}; use puffin_interpreter::Interpreter; use puffin_macros::warn_once; use puffin_normalize::PackageName; -use pypi_types::{SimpleJson, Yanked}; +use pypi_types::Yanked; use crate::file::{DistFile, SdistFile, WheelFile}; use crate::pubgrub::PubGrubVersion; use crate::yanks::AllowedYanks; +use puffin_client::SimpleMetadata; /// A map from versions to distributions. #[derive(Debug, Default)] @@ -24,7 +24,7 @@ pub struct VersionMap(BTreeMap); impl VersionMap { /// Initialize a [`VersionMap`] from the given metadata. pub(crate) fn from_metadata( - metadata: SimpleJson, + metadata: SimpleMetadata, package_name: &PackageName, tags: &Tags, markers: &MarkerEnvironment, @@ -35,86 +35,79 @@ impl VersionMap { let mut version_map: BTreeMap = BTreeMap::default(); - // Group the distributions by version and kind, discarding any incompatible - // distributions. - for file in metadata.files { - // Only add dists compatible with the python version. This is relevant for source - // distributions which give no other indication of their compatibility and wheels which - // may be tagged `py3-none-any` but have `requires-python: ">=3.9"`. - // TODO(konstin): https://github.com/astral-sh/puffin/issues/406 - if let Some(requires_python) = file.requires_python.as_ref() { - // The interpreter and marker version are often the same, but can differ. For - // example, if the user is resolving against a target Python version passed in - // via the command-line, that version will differ from the interpreter version. - let interpreter_version = interpreter.version(); - let marker_version = &markers.python_version.version; - if !requires_python.contains(interpreter_version) - || !requires_python.contains(marker_version) - { - continue; - } - } - - // Support resolving as if it were an earlier timestamp, at least as long files have - // upload time information - if let Some(exclude_newer) = exclude_newer { - match file.upload_time.as_ref() { - Some(upload_time) if upload_time >= exclude_newer => { + // Collect compatible distributions. + for (version, files) in metadata { + for (filename, file) in files.all() { + // Only add dists compatible with the python version. This is relevant for source + // distributions which give no other indication of their compatibility and wheels which + // may be tagged `py3-none-any` but have `requires-python: ">=3.9"`. + // TODO(konstin): https://github.com/astral-sh/puffin/issues/406 + if let Some(requires_python) = file.requires_python.as_ref() { + // The interpreter and marker version are often the same, but can differ. For + // example, if the user is resolving against a target Python version passed in + // via the command-line, that version will differ from the interpreter version. + let interpreter_version = interpreter.version(); + let marker_version = &markers.python_version.version; + if !requires_python.contains(interpreter_version) + || !requires_python.contains(marker_version) + { continue; } - None => { - warn_once!( - "{} is missing an upload date, but user provided {}", - file.filename, - exclude_newer, - ); - continue; - } - _ => {} } - } - if let Ok(filename) = WheelFilename::from_str(file.filename.as_str()) { + // Support resolving as if it were an earlier timestamp, at least as long files have + // upload time information + if let Some(exclude_newer) = exclude_newer { + match file.upload_time.as_ref() { + Some(upload_time) if upload_time >= exclude_newer => { + continue; + } + None => { + warn_once!( + "{} is missing an upload date, but user provided {}", + file.filename, + exclude_newer, + ); + continue; + } + _ => {} + } + } + // When resolving, exclude yanked files. if file.yanked.as_ref().is_some_and(Yanked::is_yanked) { - if allowed_yanks.allowed(package_name, &filename.version) { + if allowed_yanks.allowed(package_name, &version) { warn!("Allowing yanked version: {}", file.filename); } else { continue; } } - let priority = filename.compatibility(tags); + match filename { + DistFilename::WheelFilename(filename) => { + let priority = filename.compatibility(tags); - match version_map.entry(filename.version.into()) { - Entry::Occupied(mut entry) => { - entry.get_mut().insert_built(WheelFile(file), priority); + match version_map.entry(version.clone().into()) { + Entry::Occupied(mut entry) => { + entry.get_mut().insert_built(WheelFile(file), priority); + } + Entry::Vacant(entry) => { + entry.insert(PrioritizedDistribution::from_built( + WheelFile(file), + priority, + )); + } + } } - Entry::Vacant(entry) => { - entry.insert(PrioritizedDistribution::from_built( - WheelFile(file), - priority, - )); - } - } - } else if let Ok(filename) = - SourceDistFilename::parse(file.filename.as_str(), package_name) - { - // When resolving, exclude yanked files. - if file.yanked.as_ref().is_some_and(Yanked::is_yanked) { - if allowed_yanks.allowed(package_name, &filename.version) { - warn!("Allowing yanked version: {}", file.filename); - } else { - continue; - } - } - - match version_map.entry(filename.version.into()) { - Entry::Occupied(mut entry) => { - entry.get_mut().insert_source(SdistFile(file)); - } - Entry::Vacant(entry) => { - entry.insert(PrioritizedDistribution::from_source(SdistFile(file))); + DistFilename::SourceDistFilename(_) => { + match version_map.entry(version.clone().into()) { + Entry::Occupied(mut entry) => { + entry.get_mut().insert_source(SdistFile(file)); + } + Entry::Vacant(entry) => { + entry.insert(PrioritizedDistribution::from_source(SdistFile(file))); + } + } } } }