mirror of
https://github.com/astral-sh/uv.git
synced 2025-11-13 17:25:41 +00:00
puffin-resolver: make VersionMap construction lazy
That is, a `PrioritizedDistribution` for a specific version of a package is not actually materialized in memory until a corresponding `VersionMap::get` call is made for that version. Similarly, iteration lazily materializes distributions as it moves through the map; it specifically does not materialize everything first. The main reason why this is effective is that an `OwnedArchive<SimpleMetadata>` represents a zero-copy (other than reading the source file) version of `SimpleMetadata` that is really just a `Vec<u8>` internally. The problem with `VersionMap` construction previously was that it had to eagerly materialize a `SimpleMetadata` in memory before anything else, which defeats a large part of the purpose of zero-copy deserialization. By making more of `VersionMap` construction itself lazy, we permit doing some parts of resolution without necessarily fully deserializing a `SimpleMetadata` into memory. Indeed, with this commit, in the warm cached case, a `SimpleMetadata` is itself never materialized fully in memory. This does not fully realize the benefits of zero-copy deserialization, however. For example, we are likely still building lots of distributions in memory that we don't actually need in some cases, such as when no resolution exists or when one needs to iterate over large portions of the total versions published for a package.
This commit is contained in:
parent
e2f3ad0e28
commit
8102980192
5 changed files with 302 additions and 77 deletions
|
|
@ -1,4 +1,5 @@
|
|||
use std::collections::BTreeMap;
|
||||
use std::collections::btree_map::{BTreeMap, Entry};
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use tracing::{instrument, warn};
|
||||
|
|
@ -7,62 +8,269 @@ use distribution_filename::DistFilename;
|
|||
use distribution_types::{Dist, IndexUrl, PrioritizedDistribution, ResolvableDist};
|
||||
use pep440_rs::Version;
|
||||
use platform_tags::Tags;
|
||||
use puffin_client::{FlatDistributions, OwnedArchive, SimpleMetadata, SimpleMetadatum};
|
||||
use puffin_client::{FlatDistributions, OwnedArchive, SimpleMetadata, VersionFiles};
|
||||
use puffin_normalize::PackageName;
|
||||
use puffin_traits::NoBinary;
|
||||
use puffin_warnings::warn_user_once;
|
||||
use pypi_types::{Hashes, Yanked};
|
||||
use pypi_types::Hashes;
|
||||
use rkyv::{de::deserializers::SharedDeserializeMap, Deserialize};
|
||||
|
||||
use crate::python_requirement::PythonRequirement;
|
||||
|
||||
/// A map from versions to distributions.
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct VersionMap(BTreeMap<Version, PrioritizedDistribution>);
|
||||
#[derive(Debug)]
|
||||
pub struct VersionMap {
|
||||
inner: VersionMapInner,
|
||||
}
|
||||
|
||||
impl VersionMap {
|
||||
/// Initialize a [`VersionMap`] from the given metadata.
|
||||
#[instrument(skip_all, fields(package_name))]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) fn from_metadata(
|
||||
raw_metadata: OwnedArchive<SimpleMetadata>,
|
||||
simple_metadata: OwnedArchive<SimpleMetadata>,
|
||||
package_name: &PackageName,
|
||||
index: &IndexUrl,
|
||||
tags: &Tags,
|
||||
python_requirement: &PythonRequirement,
|
||||
exclude_newer: Option<&DateTime<Utc>>,
|
||||
mut flat_index: Option<FlatDistributions>,
|
||||
flat_index: Option<FlatDistributions>,
|
||||
no_binary: &NoBinary,
|
||||
) -> Self {
|
||||
// NOTE: We should experiment with refactoring the code
|
||||
// below to work on rkyv::Archived<SimpleMetadata>. More
|
||||
// specifically, we may want to adjust VersionMap itself to
|
||||
// contain an Archived<SimpleMetadata> of some kind, that in
|
||||
// turn is used in the resolver. The idea here is to avoid
|
||||
// eagerly deserializing all of the metadata for a package
|
||||
// up-front.
|
||||
let metadata = OwnedArchive::deserialize(&raw_metadata);
|
||||
|
||||
let mut version_map = BTreeMap::new();
|
||||
|
||||
// Check if binaries are allowed for this package
|
||||
let mut map = BTreeMap::new();
|
||||
// Create stubs for each entry in simple metadata. The full conversion
|
||||
// from a `VersionFiles` to a PrioritizedDistribution for each version
|
||||
// isn't done until that specific version is requested.
|
||||
for (datum_index, datum) in simple_metadata.iter().enumerate() {
|
||||
let version: Version = datum
|
||||
.version
|
||||
.deserialize(&mut SharedDeserializeMap::new())
|
||||
.expect("archived version always deserializes");
|
||||
map.insert(
|
||||
version,
|
||||
LazyPrioritizedDistribution::OnlySimple(SimplePrioritizedDistribution {
|
||||
datum_index,
|
||||
dist: OnceLock::new(),
|
||||
}),
|
||||
);
|
||||
}
|
||||
// If a set of flat distributions have been given, we need to add those
|
||||
// to our map of entries as well.
|
||||
for (version, prioritized_dist) in flat_index.into_iter().flatten() {
|
||||
match map.entry(version) {
|
||||
Entry::Vacant(e) => {
|
||||
e.insert(LazyPrioritizedDistribution::OnlyFlat(prioritized_dist));
|
||||
}
|
||||
// When there is both a `VersionFiles` (from the "simple"
|
||||
// metadata) and a flat distribution for the same version of
|
||||
// a package, we store both and "merge" them into a single
|
||||
// `PrioritizedDistribution` upon access later.
|
||||
Entry::Occupied(e) => match e.remove_entry() {
|
||||
(version, LazyPrioritizedDistribution::OnlySimple(simple_dist)) => {
|
||||
map.insert(
|
||||
version,
|
||||
LazyPrioritizedDistribution::Both {
|
||||
flat: prioritized_dist,
|
||||
simple: simple_dist,
|
||||
},
|
||||
);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
},
|
||||
}
|
||||
}
|
||||
// Check if binaries are allowed for this package.
|
||||
let no_binary = match no_binary {
|
||||
NoBinary::None => false,
|
||||
NoBinary::All => true,
|
||||
NoBinary::Packages(packages) => packages.contains(package_name),
|
||||
};
|
||||
VersionMap {
|
||||
inner: VersionMapInner::Lazy(VersionMapLazy {
|
||||
map,
|
||||
simple_metadata,
|
||||
no_binary,
|
||||
index: index.clone(),
|
||||
tags: tags.clone(),
|
||||
python_requirement: python_requirement.clone(),
|
||||
exclude_newer: exclude_newer.copied(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
// Collect compatible distributions.
|
||||
for SimpleMetadatum { version, files } in metadata {
|
||||
// If we have packages of the same name from find links, give them
|
||||
// priority, otherwise start with an empty priority dist.
|
||||
let mut priority_dist = flat_index
|
||||
.as_mut()
|
||||
.and_then(|flat_index| flat_index.remove(&version))
|
||||
.unwrap_or_default();
|
||||
/// Return the [`DistFile`] for the given version, if any.
|
||||
pub(crate) fn get(&self, version: &Version) -> Option<ResolvableDist> {
|
||||
self.get_with_version(version)
|
||||
.map(|(_, resolvable_dist)| resolvable_dist)
|
||||
}
|
||||
|
||||
/// Return the [`DistFile`] and the `Version` from the map for the given
|
||||
/// version, if any.
|
||||
///
|
||||
/// This is useful when you depend on access to the specific `Version`
|
||||
/// stored in this map. For example, the versions `1.2.0` and `1.2` are
|
||||
/// semantically equivalent, but when converted to strings, they are
|
||||
/// distinct.
|
||||
pub(crate) fn get_with_version<'a>(
|
||||
&'a self,
|
||||
version: &Version,
|
||||
) -> Option<(&'a Version, ResolvableDist)> {
|
||||
match self.inner {
|
||||
VersionMapInner::Eager(ref map) => map
|
||||
.get_key_value(version)
|
||||
.and_then(|(version, dist)| Some((version, dist.get()?))),
|
||||
VersionMapInner::Lazy(ref lazy) => lazy
|
||||
.get_with_version(version)
|
||||
.and_then(|(version, dist)| Some((version, dist.get()?))),
|
||||
}
|
||||
}
|
||||
|
||||
/// Return an iterator over the versions and distributions.
|
||||
pub(crate) fn iter(&self) -> impl DoubleEndedIterator<Item = (&Version, ResolvableDist)> {
|
||||
match self.inner {
|
||||
VersionMapInner::Eager { ref map } => either::Either::Left(
|
||||
map.iter()
|
||||
.filter_map(|(version, dist)| Some((version, dist.get()?))),
|
||||
),
|
||||
VersionMapInner::Lazy(ref lazy) => {
|
||||
either::Either::Right(lazy.map.iter().filter_map(|(version, lazy_dist)| {
|
||||
Some((version, lazy.get_lazy(lazy_dist)?.get()?))
|
||||
}))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the [`Hashes`] for the given version, if any.
|
||||
pub(crate) fn hashes(&self, version: &Version) -> Vec<Hashes> {
|
||||
match self.inner {
|
||||
VersionMapInner::Eager(ref map) => map
|
||||
.get(version)
|
||||
.map(|file| file.hashes().to_vec())
|
||||
.unwrap_or_default(),
|
||||
VersionMapInner::Lazy(ref lazy) => lazy
|
||||
.get(version)
|
||||
.map(|file| file.hashes().to_vec())
|
||||
.unwrap_or_default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the total number of distinct versions in this map.
|
||||
pub(crate) fn len(&self) -> usize {
|
||||
match self.inner {
|
||||
VersionMapInner::Eager(ref map) => map.len(),
|
||||
VersionMapInner::Lazy(VersionMapLazy { ref map, .. }) => map.len(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<FlatDistributions> for VersionMap {
|
||||
fn from(flat_index: FlatDistributions) -> Self {
|
||||
VersionMap {
|
||||
inner: VersionMapInner::Eager(flat_index.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The kind of internal version map we have.
|
||||
#[derive(Debug)]
|
||||
enum VersionMapInner {
|
||||
/// All distributions are fully materialized in memory.
|
||||
///
|
||||
/// This usually happens when one needs a `VersionMap` from a
|
||||
/// `FlatDistributions`.
|
||||
Eager(BTreeMap<Version, PrioritizedDistribution>),
|
||||
/// Some distributions might be fully materialized (i.e., by initializing
|
||||
/// a `VersionMap` with a `FlatDistributions`), but some distributions
|
||||
/// might still be in their "raw" `SimpleMetadata` format. In this case, a
|
||||
/// `PrioritizedDistribution` isn't actually created in memory until the
|
||||
/// specific version has been requested.
|
||||
Lazy(VersionMapLazy),
|
||||
}
|
||||
|
||||
/// A map that lazily materializes some prioritized distributions upon access.
|
||||
///
|
||||
/// The idea here is that some packages have a lot of versions published, and
|
||||
/// needing to materialize a full `VersionMap` with all corresponding metadata
|
||||
/// for every version in memory is expensive. Since a `SimpleMetadata` can be
|
||||
/// materialized with very little cost (via `rkyv` in the warm cached case),
|
||||
/// avoiding another conversion step into a fully filled out `VersionMap` can
|
||||
/// provide substantial savings in some cases.
|
||||
#[derive(Debug)]
|
||||
struct VersionMapLazy {
|
||||
/// A map from version to possibly-initialized distribution.
|
||||
map: BTreeMap<Version, LazyPrioritizedDistribution>,
|
||||
/// The raw simple metadata from which `PrioritizedDistribution`s should
|
||||
/// be constructed.
|
||||
simple_metadata: OwnedArchive<SimpleMetadata>,
|
||||
/// When true, wheels aren't allowed.
|
||||
no_binary: bool,
|
||||
/// The URL of the index where this package came from.
|
||||
index: IndexUrl,
|
||||
/// The set of compatibility tags that determines whether a wheel is usable
|
||||
/// in the current environment.
|
||||
tags: Tags,
|
||||
/// The version of Python active in the current environment. This is used
|
||||
/// to determine whether a package's Python version constraint (if one
|
||||
/// exists) is satisfied or not.
|
||||
python_requirement: PythonRequirement,
|
||||
/// Whether files newer than this timestamp should be excluded or not.
|
||||
exclude_newer: Option<DateTime<Utc>>,
|
||||
}
|
||||
|
||||
impl VersionMapLazy {
|
||||
/// Returns the distribution for the given version, if it exists.
|
||||
fn get(&self, version: &Version) -> Option<&PrioritizedDistribution> {
|
||||
self.get_with_version(version)
|
||||
.map(|(_, prioritized_dist)| prioritized_dist)
|
||||
}
|
||||
|
||||
/// Returns the distribution for the given version along with the version
|
||||
/// in this map, if it exists.
|
||||
fn get_with_version(&self, version: &Version) -> Option<(&Version, &PrioritizedDistribution)> {
|
||||
let (version, lazy_dist) = self.map.get_key_value(version)?;
|
||||
let priority_dist = self.get_lazy(lazy_dist)?;
|
||||
Some((version, priority_dist))
|
||||
}
|
||||
|
||||
/// Given a reference to a possibly-initialized distribution that is in
|
||||
/// this lazy map, return the corresponding distribution.
|
||||
///
|
||||
/// When both a flat and simple distribution are present internally, they
|
||||
/// are merged automatically.
|
||||
fn get_lazy<'p>(
|
||||
&'p self,
|
||||
lazy_dist: &'p LazyPrioritizedDistribution,
|
||||
) -> Option<&'p PrioritizedDistribution> {
|
||||
match *lazy_dist {
|
||||
LazyPrioritizedDistribution::OnlyFlat(ref dist) => Some(dist),
|
||||
LazyPrioritizedDistribution::OnlySimple(ref dist) => self.get_simple(None, dist),
|
||||
LazyPrioritizedDistribution::Both {
|
||||
ref flat,
|
||||
ref simple,
|
||||
} => self.get_simple(Some(flat), simple),
|
||||
}
|
||||
}
|
||||
|
||||
/// Given an optional starting point, return the final form of the
|
||||
/// given simple distribution. If it wasn't initialized yet, then this
|
||||
/// initializes it. If the distribution would otherwise be empty, this
|
||||
/// returns `None`.
|
||||
fn get_simple<'p>(
|
||||
&'p self,
|
||||
init: Option<&'p PrioritizedDistribution>,
|
||||
simple: &'p SimplePrioritizedDistribution,
|
||||
) -> Option<&'p PrioritizedDistribution> {
|
||||
let get_or_init = || {
|
||||
let files: VersionFiles = self
|
||||
.simple_metadata
|
||||
.datum(simple.datum_index)
|
||||
.expect("index to lazy dist is correct")
|
||||
.files
|
||||
.deserialize(&mut SharedDeserializeMap::new())
|
||||
.expect("archived version files should deserialize");
|
||||
let mut priority_dist = init.cloned().unwrap_or_default();
|
||||
for (filename, file) in files.all() {
|
||||
// Support resolving as if it were an earlier timestamp, at least as long as files have
|
||||
// upload time information.
|
||||
if let Some(exclude_newer) = exclude_newer {
|
||||
if let Some(exclude_newer) = self.exclude_newer {
|
||||
match file.upload_time_utc_ms.as_ref() {
|
||||
Some(&upload_time) if upload_time >= exclude_newer.timestamp_millis() => {
|
||||
continue;
|
||||
|
|
@ -77,36 +285,30 @@ impl VersionMap {
|
|||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
let yanked = if let Some(ref yanked) = file.yanked {
|
||||
yanked.clone()
|
||||
} else {
|
||||
Yanked::default()
|
||||
};
|
||||
|
||||
// Prioritize amongst all available files.
|
||||
let yanked = file.yanked.clone().unwrap_or_default();
|
||||
let requires_python = file.requires_python.clone();
|
||||
let hash = file.hashes.clone();
|
||||
match filename {
|
||||
DistFilename::WheelFilename(filename) => {
|
||||
// If pre-built binaries are disabled, skip this wheel
|
||||
if no_binary {
|
||||
if self.no_binary {
|
||||
continue;
|
||||
}
|
||||
|
||||
// To be compatible, the wheel must both have compatible tags _and_ have a
|
||||
// compatible Python requirement.
|
||||
let priority = filename.compatibility(tags).filter(|_| {
|
||||
// To be compatible, the wheel must both have
|
||||
// compatible tags _and_ have a compatible Python
|
||||
// requirement.
|
||||
let priority = filename.compatibility(&self.tags).filter(|_| {
|
||||
file.requires_python
|
||||
.as_ref()
|
||||
.map_or(true, |requires_python| {
|
||||
requires_python.contains(python_requirement.target())
|
||||
requires_python.contains(self.python_requirement.target())
|
||||
})
|
||||
});
|
||||
let dist = Dist::from_registry(
|
||||
DistFilename::WheelFilename(filename),
|
||||
file,
|
||||
index.clone(),
|
||||
self.index.clone(),
|
||||
);
|
||||
priority_dist.insert_built(
|
||||
dist,
|
||||
|
|
@ -120,45 +322,53 @@ impl VersionMap {
|
|||
let dist = Dist::from_registry(
|
||||
DistFilename::SourceDistFilename(filename),
|
||||
file,
|
||||
index.clone(),
|
||||
self.index.clone(),
|
||||
);
|
||||
priority_dist.insert_source(dist, requires_python, yanked, Some(hash));
|
||||
}
|
||||
}
|
||||
}
|
||||
version_map.insert(version, priority_dist);
|
||||
}
|
||||
// Add any left over packages from the version map that we didn't visit
|
||||
// above via `SimpleMetadata`.
|
||||
if let Some(flat_index) = flat_index {
|
||||
version_map.extend(flat_index.into_iter());
|
||||
}
|
||||
Self(version_map)
|
||||
}
|
||||
|
||||
/// Return the [`DistFile`] for the given version, if any.
|
||||
pub(crate) fn get(&self, version: &Version) -> Option<ResolvableDist> {
|
||||
self.0.get(version).and_then(PrioritizedDistribution::get)
|
||||
}
|
||||
|
||||
/// Return an iterator over the versions and distributions.
|
||||
pub(crate) fn iter(&self) -> impl DoubleEndedIterator<Item = (&Version, ResolvableDist)> {
|
||||
self.0
|
||||
.iter()
|
||||
.filter_map(|(version, dist)| Some((version, dist.get()?)))
|
||||
}
|
||||
|
||||
/// Return the [`Hashes`] for the given version, if any.
|
||||
pub(crate) fn hashes(&self, version: &Version) -> Vec<Hashes> {
|
||||
self.0
|
||||
.get(version)
|
||||
.map(|file| file.hashes().to_vec())
|
||||
.unwrap_or_default()
|
||||
if priority_dist.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(priority_dist)
|
||||
}
|
||||
};
|
||||
simple.dist.get_or_init(get_or_init).as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<FlatDistributions> for VersionMap {
|
||||
fn from(flat_index: FlatDistributions) -> Self {
|
||||
Self(flat_index.into())
|
||||
}
|
||||
/// Represents a possibly initialized `PrioritizedDistribution` for
|
||||
/// a single version of a package.
|
||||
#[derive(Debug)]
|
||||
enum LazyPrioritizedDistribution {
|
||||
/// Represents a eagerly constructed distribution from a
|
||||
/// `FlatDistributions`.
|
||||
OnlyFlat(PrioritizedDistribution),
|
||||
/// Represents a lazyily constructed distribution from an index into a
|
||||
/// `VersionFiles` from `SimpleMetadata`.
|
||||
OnlySimple(SimplePrioritizedDistribution),
|
||||
/// Combines the above. This occurs when we have data from both a flat
|
||||
/// distribution and a simple distribution.
|
||||
Both {
|
||||
flat: PrioritizedDistribution,
|
||||
simple: SimplePrioritizedDistribution,
|
||||
},
|
||||
}
|
||||
|
||||
/// Represents a lazily initialized `PrioritizedDistribution`.
|
||||
#[derive(Debug)]
|
||||
struct SimplePrioritizedDistribution {
|
||||
/// An offset into `SimpleMetadata` corresponding to a `SimpleMetadatum`.
|
||||
/// This provides access to a `VersionFiles` that is used to construct a
|
||||
/// `PrioritizedDistribution`.
|
||||
datum_index: usize,
|
||||
/// A lazily initialized distribution.
|
||||
///
|
||||
/// Note that the `Option` does not represent the initialization state.
|
||||
/// The `Option` can be `None` even after initialization, for example,
|
||||
/// if initialization could not find any usable files from which to
|
||||
/// construct a distribution. (One easy way to effect this, at the time
|
||||
/// of writing, is to use `--exclude-newer 1900-01-01`.)
|
||||
dist: OnceLock<Option<PrioritizedDistribution>>,
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue