Parse SimpleJson into categorized data in the client (#522)

Extends #517 with a suggestion from @konstin to parse the `SimpleJson`
into an intermediate type `SimpleMetadata(BTreeMap<Version,
VersionFiles>)` before converting to a `VersionMap`. This reduces the
number of times we need to parse the response. Additionally, we cache
the parsed response now instead of `SimpleJson`.

`VersionFiles` stores two vectors with
`WheelFilename`/`SourceDistFilename` and `File` tuples. These can be
iterated over together or separately. A new enum `DistFilename` was
added to capture the `SourceDistFilename` and `WheelFilename` variants
allowing iteration over both vectors.
This commit is contained in:
Zanie Blue 2023-12-07 11:04:47 -06:00 committed by GitHub
parent 5d3ce963b2
commit ef7be9103c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 233 additions and 132 deletions

1
Cargo.lock generated
View file

@ -2365,6 +2365,7 @@ dependencies = [
"http",
"http-cache-semantics",
"install-wheel-rs",
"pep440_rs 0.3.12",
"puffin-cache",
"puffin-fs",
"puffin-normalize",

View file

@ -1,5 +1,28 @@
use std::str::FromStr;
pub use source_dist::{SourceDistExtension, SourceDistFilename, SourceDistFilenameError};
pub use wheel::{WheelFilename, WheelFilenameError};
mod source_dist;
mod wheel;
#[derive(Debug, Clone)]
pub enum DistFilename {
SourceDistFilename(SourceDistFilename),
WheelFilename(WheelFilename),
}
impl DistFilename {
pub fn try_from_filename(
filename: &str,
package_name: &puffin_normalize::PackageName,
) -> Option<Self> {
if let Ok(filename) = WheelFilename::from_str(filename) {
Some(Self::WheelFilename(filename))
} else if let Ok(filename) = SourceDistFilename::parse(filename, package_name) {
Some(Self::SourceDistFilename(filename))
} else {
None
}
}
}

View file

@ -1,12 +1,14 @@
use std::fmt::{Display, Formatter};
use std::str::FromStr;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use thiserror::Error;
use pep440_rs::Version;
use puffin_normalize::{InvalidNameError, PackageName};
#[derive(Clone, Debug, PartialEq, Eq)]
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum SourceDistExtension {
Zip,
TarGz,
@ -48,6 +50,7 @@ impl SourceDistExtension {
/// Note that this is a normalized and not an exact representation, keep the original string if you
/// need the latter.
#[derive(Clone, Debug, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct SourceDistFilename {
pub name: PackageName,
pub version: Version,

View file

@ -370,7 +370,7 @@ pub enum CacheBucket {
/// * `simple-v0/pypi/<package_name>.json`
/// * `simple-v0/<digest(index_url)>/<package_name>.json`
///
/// TODO(konstin): Link to the json type after <https://github.com/astral-sh/puffin/pull/522/>
/// The response is parsed into [`puffin_client::SimpleMetadata`] before storage.
Simple,
}

View file

@ -7,6 +7,7 @@ edition = "2021"
distribution-filename = { path = "../distribution-filename" }
distribution-types = { path = "../distribution-types" }
install-wheel-rs = { path = "../install-wheel-rs" }
pep440_rs = { path = "../pep440-rs" }
puffin-cache = { path = "../puffin-cache" }
puffin-fs = { path = "../puffin-fs" }
puffin-normalize = { path = "../puffin-normalize" }

View file

@ -1,6 +1,6 @@
pub use cached_client::{CachedClient, CachedClientError, DataWithCachePolicy};
pub use error::Error;
pub use registry_client::{RegistryClient, RegistryClientBuilder};
pub use registry_client::{RegistryClient, RegistryClientBuilder, SimpleMetadata};
mod cached_client;
mod error;

View file

@ -1,3 +1,4 @@
use std::collections::BTreeMap;
use std::fmt::Debug;
use std::path::Path;
use std::str::FromStr;
@ -8,15 +9,17 @@ use futures::TryStreamExt;
use reqwest::{Client, ClientBuilder, Response, StatusCode};
use reqwest_retry::policies::ExponentialBackoff;
use reqwest_retry::RetryTransientMiddleware;
use serde::{Deserialize, Serialize};
use tempfile::tempfile_in;
use tokio::io::BufWriter;
use tokio_util::compat::FuturesAsyncReadCompatExt;
use tracing::{debug, trace};
use url::Url;
use distribution_filename::WheelFilename;
use distribution_filename::{DistFilename, SourceDistFilename, WheelFilename};
use distribution_types::{BuiltDist, Metadata};
use install_wheel_rs::find_dist_info;
use pep440_rs::Version;
use puffin_cache::{digest, Cache, CacheBucket, CanonicalUrl, WheelCache};
use puffin_normalize::PackageName;
use pypi_types::{File, IndexUrl, IndexUrls, Metadata21, SimpleJson};
@ -122,7 +125,7 @@ impl RegistryClient {
pub async fn simple(
&self,
package_name: &PackageName,
) -> Result<(IndexUrl, SimpleJson), Error> {
) -> Result<(IndexUrl, SimpleMetadata), Error> {
if self.index_urls.no_index() {
return Err(Error::NoIndex(package_name.as_ref().to_string()));
}
@ -155,7 +158,8 @@ impl RegistryClient {
let bytes = response.bytes().await?;
let data: SimpleJson = serde_json::from_slice(bytes.as_ref())
.map_err(|err| Error::from_json_err(err, url))?;
Ok(data)
let metadata = SimpleMetadata::from_package_simple_json(package_name, data);
Ok(metadata)
};
let result = self
.client
@ -164,8 +168,8 @@ impl RegistryClient {
// Fetch from the index.
match result {
Ok(simple_json) => {
return Ok((index.clone(), simple_json));
Ok(simple_metadata) => {
return Ok((index.clone(), simple_metadata));
}
Err(CachedClientError::Client(Error::RequestError(err))) => {
if err.status() == Some(StatusCode::NOT_FOUND) {
@ -384,3 +388,76 @@ impl RegistryClient {
))
}
}
#[derive(Default, Debug, Serialize, Deserialize)]
pub struct VersionFiles {
pub wheels: Vec<(WheelFilename, File)>,
pub source_dists: Vec<(SourceDistFilename, File)>,
}
impl VersionFiles {
fn push(&mut self, filename: DistFilename, file: File) {
match filename {
DistFilename::WheelFilename(inner) => self.wheels.push((inner, file)),
DistFilename::SourceDistFilename(inner) => self.source_dists.push((inner, file)),
}
}
pub fn all(self) -> impl Iterator<Item = (DistFilename, File)> {
self.wheels
.into_iter()
.map(|(filename, file)| (DistFilename::WheelFilename(filename), file))
.chain(
self.source_dists
.into_iter()
.map(|(filename, file)| (DistFilename::SourceDistFilename(filename), file)),
)
}
}
#[derive(Default, Debug, Serialize, Deserialize)]
pub struct SimpleMetadata(BTreeMap<Version, VersionFiles>);
impl SimpleMetadata {
pub fn iter(&self) -> impl DoubleEndedIterator<Item = (&Version, &VersionFiles)> {
self.0.iter()
}
fn from_package_simple_json(package_name: &PackageName, simple_json: SimpleJson) -> Self {
let mut metadata = Self::default();
// Group the distributions by version and kind
for file in simple_json.files {
if let Some(filename) =
DistFilename::try_from_filename(file.filename.as_str(), package_name)
{
let version = match filename {
DistFilename::SourceDistFilename(ref inner) => &inner.version,
DistFilename::WheelFilename(ref inner) => &inner.version,
};
match metadata.0.entry(version.clone()) {
std::collections::btree_map::Entry::Occupied(mut entry) => {
entry.get_mut().push(filename, file);
}
std::collections::btree_map::Entry::Vacant(entry) => {
let mut files = VersionFiles::default();
files.push(filename, file);
entry.insert(files);
}
}
}
}
metadata
}
}
impl IntoIterator for SimpleMetadata {
type Item = (Version, VersionFiles);
type IntoIter = std::collections::btree_map::IntoIter<Version, VersionFiles>;
fn into_iter(self) -> Self::IntoIter {
self.0.into_iter()
}
}

View file

@ -3,21 +3,19 @@
//! This is similar to running `pip install` with the `--no-deps` flag.
use std::hash::BuildHasherDefault;
use std::str::FromStr;
use anyhow::Result;
use futures::StreamExt;
use fxhash::FxHashMap;
use distribution_filename::{SourceDistFilename, WheelFilename};
use distribution_types::Dist;
use pep440_rs::Version;
use pep508_rs::{Requirement, VersionOrUrl};
use platform_tags::{TagPriority, Tags};
use puffin_client::RegistryClient;
use puffin_client::{RegistryClient, SimpleMetadata};
use puffin_interpreter::Interpreter;
use puffin_normalize::PackageName;
use pypi_types::{File, IndexUrl, SimpleJson};
use pypi_types::IndexUrl;
use crate::error::ResolveError;
use crate::resolution::Resolution;
@ -108,8 +106,7 @@ impl<'a> DistFinder<'a> {
match result {
Response::Package(requirement, index, metadata) => {
// Pick a version that satisfies the requirement.
let Some(distribution) = self.select(&requirement, &index, metadata.files)
else {
let Some(distribution) = self.select(&requirement, &index, metadata) else {
return Err(ResolveError::NotFound(requirement));
};
@ -141,77 +138,82 @@ impl<'a> DistFinder<'a> {
&self,
requirement: &Requirement,
index: &IndexUrl,
files: Vec<File>,
metadata: SimpleMetadata,
) -> Option<Dist> {
let mut best_version: Option<Version> = None;
let mut best_wheel: Option<(Dist, TagPriority)> = None;
let mut best_sdist: Option<Dist> = None;
for file in files.into_iter().rev() {
// Only add dists compatible with the python version.
// This is relevant for source dists which give no other indication of their
// compatibility and wheels which may be tagged `py3-none-any` but
// have `requires-python: ">=3.9"`
// TODO(konstin): https://github.com/astral-sh/puffin/issues/406
if file
.requires_python
for (version, files) in metadata.into_iter().rev() {
// If we iterated past the first-compatible version, break.
if best_version
.as_ref()
.is_some_and(|requires_python| {
!requires_python.contains(self.interpreter.version())
})
.is_some_and(|best_version| *best_version != version)
{
break;
}
// If the version does not satisfy the requirement, continue.
if !requirement.is_satisfied_by(&version) {
continue;
}
// Find the most-compatible wheel.
if let Ok(wheel) = WheelFilename::from_str(file.filename.as_str()) {
// If we iterated past the first-compatible version, break.
if best_version
// Find the most-compatible wheel
for (wheel, file) in files.wheels {
// Only add dists compatible with the python version.
// This is relevant for source dists which give no other indication of their
// compatibility and wheels which may be tagged `py3-none-any` but
// have `requires-python: ">=3.9"`
// TODO(konstin): https://github.com/astral-sh/puffin/issues/406
if !file
.requires_python
.as_ref()
.is_some_and(|version| *version != wheel.version)
.map_or(true, |requires_python| {
requires_python.contains(self.interpreter.version())
})
{
break;
continue;
}
if requirement.is_satisfied_by(&wheel.version) {
best_version = Some(wheel.version.clone());
if let Some(priority) = wheel.compatibility(self.tags) {
if best_wheel
.as_ref()
.map_or(true, |(.., existing)| priority > *existing)
{
best_wheel = Some((
Dist::from_registry(wheel.name, wheel.version, file, index.clone()),
priority,
));
}
best_version = Some(version.clone());
if let Some(priority) = wheel.compatibility(self.tags) {
if best_wheel
.as_ref()
.map_or(true, |(.., existing)| priority > *existing)
{
best_wheel = Some((
Dist::from_registry(wheel.name, wheel.version, file, index.clone()),
priority,
));
}
}
continue;
}
// Find the most-compatible sdist, if no wheel was found.
if best_wheel.is_none() {
if let Ok(sdist) =
SourceDistFilename::parse(file.filename.as_str(), &requirement.name)
{
// If we iterated past the first-compatible version, break.
if best_version
for (sdist, file) in files.source_dists {
// Only add dists compatible with the python version.
// This is relevant for source dists which give no other indication of their
// compatibility and wheels which may be tagged `py3-none-any` but
// have `requires-python: ">=3.9"`
// TODO(konstin): https://github.com/astral-sh/puffin/issues/406
if !file
.requires_python
.as_ref()
.is_some_and(|version| *version != sdist.version)
.map_or(true, |requires_python| {
requires_python.contains(self.interpreter.version())
})
{
break;
continue;
}
if requirement.is_satisfied_by(&sdist.version) {
best_version = Some(sdist.version.clone());
best_sdist = Some(Dist::from_registry(
sdist.name,
sdist.version,
file,
index.clone(),
));
}
best_version = Some(sdist.version.clone());
best_sdist = Some(Dist::from_registry(
sdist.name,
sdist.version,
file,
index.clone(),
));
}
}
}
@ -229,7 +231,7 @@ enum Request {
#[derive(Debug)]
enum Response {
/// The returned metadata for a package.
Package(Requirement, IndexUrl, SimpleJson),
Package(Requirement, IndexUrl, SimpleMetadata),
}
pub trait Reporter: Send + Sync {

View file

@ -9,6 +9,7 @@ use chrono::{DateTime, Utc};
use futures::channel::mpsc::UnboundedReceiver;
use futures::{pin_mut, FutureExt, StreamExt, TryFutureExt};
use fxhash::{FxHashMap, FxHashSet};
use pubgrub::error::PubGrubError;
use pubgrub::range::Range;
use pubgrub::solver::{Incompatibility, State};

View file

@ -1,21 +1,21 @@
use std::collections::btree_map::Entry;
use std::collections::BTreeMap;
use std::str::FromStr;
use chrono::{DateTime, Utc};
use tracing::warn;
use distribution_filename::{SourceDistFilename, WheelFilename};
use distribution_filename::DistFilename;
use pep508_rs::MarkerEnvironment;
use platform_tags::{TagPriority, Tags};
use puffin_interpreter::Interpreter;
use puffin_macros::warn_once;
use puffin_normalize::PackageName;
use pypi_types::{SimpleJson, Yanked};
use pypi_types::Yanked;
use crate::file::{DistFile, SdistFile, WheelFile};
use crate::pubgrub::PubGrubVersion;
use crate::yanks::AllowedYanks;
use puffin_client::SimpleMetadata;
/// A map from versions to distributions.
#[derive(Debug, Default)]
@ -24,7 +24,7 @@ pub struct VersionMap(BTreeMap<PubGrubVersion, PrioritizedDistribution>);
impl VersionMap {
/// Initialize a [`VersionMap`] from the given metadata.
pub(crate) fn from_metadata(
metadata: SimpleJson,
metadata: SimpleMetadata,
package_name: &PackageName,
tags: &Tags,
markers: &MarkerEnvironment,
@ -35,86 +35,79 @@ impl VersionMap {
let mut version_map: BTreeMap<PubGrubVersion, PrioritizedDistribution> =
BTreeMap::default();
// Group the distributions by version and kind, discarding any incompatible
// distributions.
for file in metadata.files {
// Only add dists compatible with the python version. This is relevant for source
// distributions which give no other indication of their compatibility and wheels which
// may be tagged `py3-none-any` but have `requires-python: ">=3.9"`.
// TODO(konstin): https://github.com/astral-sh/puffin/issues/406
if let Some(requires_python) = file.requires_python.as_ref() {
// The interpreter and marker version are often the same, but can differ. For
// example, if the user is resolving against a target Python version passed in
// via the command-line, that version will differ from the interpreter version.
let interpreter_version = interpreter.version();
let marker_version = &markers.python_version.version;
if !requires_python.contains(interpreter_version)
|| !requires_python.contains(marker_version)
{
continue;
}
}
// Support resolving as if it were an earlier timestamp, at least as long files have
// upload time information
if let Some(exclude_newer) = exclude_newer {
match file.upload_time.as_ref() {
Some(upload_time) if upload_time >= exclude_newer => {
// Collect compatible distributions.
for (version, files) in metadata {
for (filename, file) in files.all() {
// Only add dists compatible with the python version. This is relevant for source
// distributions which give no other indication of their compatibility and wheels which
// may be tagged `py3-none-any` but have `requires-python: ">=3.9"`.
// TODO(konstin): https://github.com/astral-sh/puffin/issues/406
if let Some(requires_python) = file.requires_python.as_ref() {
// The interpreter and marker version are often the same, but can differ. For
// example, if the user is resolving against a target Python version passed in
// via the command-line, that version will differ from the interpreter version.
let interpreter_version = interpreter.version();
let marker_version = &markers.python_version.version;
if !requires_python.contains(interpreter_version)
|| !requires_python.contains(marker_version)
{
continue;
}
None => {
warn_once!(
"{} is missing an upload date, but user provided {}",
file.filename,
exclude_newer,
);
continue;
}
_ => {}
}
}
if let Ok(filename) = WheelFilename::from_str(file.filename.as_str()) {
// Support resolving as if it were an earlier timestamp, at least as long files have
// upload time information
if let Some(exclude_newer) = exclude_newer {
match file.upload_time.as_ref() {
Some(upload_time) if upload_time >= exclude_newer => {
continue;
}
None => {
warn_once!(
"{} is missing an upload date, but user provided {}",
file.filename,
exclude_newer,
);
continue;
}
_ => {}
}
}
// When resolving, exclude yanked files.
if file.yanked.as_ref().is_some_and(Yanked::is_yanked) {
if allowed_yanks.allowed(package_name, &filename.version) {
if allowed_yanks.allowed(package_name, &version) {
warn!("Allowing yanked version: {}", file.filename);
} else {
continue;
}
}
let priority = filename.compatibility(tags);
match filename {
DistFilename::WheelFilename(filename) => {
let priority = filename.compatibility(tags);
match version_map.entry(filename.version.into()) {
Entry::Occupied(mut entry) => {
entry.get_mut().insert_built(WheelFile(file), priority);
match version_map.entry(version.clone().into()) {
Entry::Occupied(mut entry) => {
entry.get_mut().insert_built(WheelFile(file), priority);
}
Entry::Vacant(entry) => {
entry.insert(PrioritizedDistribution::from_built(
WheelFile(file),
priority,
));
}
}
}
Entry::Vacant(entry) => {
entry.insert(PrioritizedDistribution::from_built(
WheelFile(file),
priority,
));
}
}
} else if let Ok(filename) =
SourceDistFilename::parse(file.filename.as_str(), package_name)
{
// When resolving, exclude yanked files.
if file.yanked.as_ref().is_some_and(Yanked::is_yanked) {
if allowed_yanks.allowed(package_name, &filename.version) {
warn!("Allowing yanked version: {}", file.filename);
} else {
continue;
}
}
match version_map.entry(filename.version.into()) {
Entry::Occupied(mut entry) => {
entry.get_mut().insert_source(SdistFile(file));
}
Entry::Vacant(entry) => {
entry.insert(PrioritizedDistribution::from_source(SdistFile(file)));
DistFilename::SourceDistFilename(_) => {
match version_map.entry(version.clone().into()) {
Entry::Occupied(mut entry) => {
entry.get_mut().insert_source(SdistFile(file));
}
Entry::Vacant(entry) => {
entry.insert(PrioritizedDistribution::from_source(SdistFile(file)));
}
}
}
}
}