mirror of
https://github.com/astral-sh/uv.git
synced 2025-08-04 19:08:04 +00:00
Wheel metadata refactor (#462)
A consistent cache structure for remote wheel metadata: * `<wheel metadata cache>/pypi/foo-1.0.0-py3-none-any.json` * `<wheel metadata cache>/<digest(index-url)>/foo-1.0.0-py3-none-any.json` * `<wheel metadata cache>/url/<digest(url)>/foo-1.0.0-py3-none-any.json` The source dist caching will use a similar structure (#468).
This commit is contained in:
parent
d3e9e1783f
commit
f0841cdb6e
18 changed files with 153 additions and 47 deletions
7
Cargo.lock
generated
7
Cargo.lock
generated
|
@ -874,6 +874,7 @@ name = "distribution-types"
|
|||
version = "0.0.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"distribution-filename",
|
||||
"fs-err",
|
||||
"pep440_rs 0.3.12",
|
||||
"puffin-cache",
|
||||
|
@ -2371,8 +2372,10 @@ version = "0.0.1"
|
|||
dependencies = [
|
||||
"clap",
|
||||
"directories",
|
||||
"distribution-filename",
|
||||
"fs-err",
|
||||
"hex",
|
||||
"pypi-types",
|
||||
"seahash",
|
||||
"tempfile",
|
||||
"url",
|
||||
|
@ -2447,6 +2450,7 @@ dependencies = [
|
|||
"http-cache-reqwest",
|
||||
"http-cache-semantics",
|
||||
"install-wheel-rs",
|
||||
"puffin-cache",
|
||||
"puffin-normalize",
|
||||
"pypi-types",
|
||||
"reqwest",
|
||||
|
@ -2454,6 +2458,7 @@ dependencies = [
|
|||
"reqwest-retry",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sha2",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
|
@ -2618,8 +2623,10 @@ dependencies = [
|
|||
name = "puffin-macros"
|
||||
version = "0.0.1"
|
||||
dependencies = [
|
||||
"colored",
|
||||
"fxhash",
|
||||
"once_cell",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
|
@ -10,6 +10,7 @@ authors = { workspace = true }
|
|||
license = { workspace = true }
|
||||
|
||||
[dependencies]
|
||||
distribution-filename = { path = "../distribution-filename" }
|
||||
pep440_rs = { path = "../pep440-rs" }
|
||||
puffin-cache = { path = "../puffin-cache" }
|
||||
puffin-git = { path = "../puffin-git" }
|
||||
|
|
|
@ -80,7 +80,7 @@ impl CachedDist {
|
|||
path,
|
||||
}),
|
||||
Dist::Built(BuiltDist::DirectUrl(dist)) => Self::Url(CachedDirectUrlDist {
|
||||
name: dist.name,
|
||||
name: dist.filename.name,
|
||||
url: dist.url,
|
||||
path,
|
||||
}),
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
use std::path::Path;
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use distribution_filename::WheelFilename;
|
||||
use url::Url;
|
||||
|
||||
use pep440_rs::Version;
|
||||
|
@ -68,7 +70,9 @@ pub struct RegistryBuiltDist {
|
|||
/// A built distribution (wheel) that exists at an arbitrary URL.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct DirectUrlBuiltDist {
|
||||
pub name: PackageName,
|
||||
/// We require that wheel urls end in the full wheel filename, e.g.
|
||||
/// `https://example.org/packages/flask-3.0.0-py3-none-any.whl`
|
||||
pub filename: WheelFilename,
|
||||
pub url: Url,
|
||||
}
|
||||
|
||||
|
@ -84,6 +88,8 @@ pub struct RegistrySourceDist {
|
|||
/// A source distribution that exists at an arbitrary URL.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct DirectUrlSourceDist {
|
||||
/// Unlike [`DirectUrlBuiltDist`], we can't require a full filename with a version here, people
|
||||
/// like using e.g. `foo @ https://github.com/org/repo/archive/master.zip`
|
||||
pub name: PackageName,
|
||||
pub url: Url,
|
||||
}
|
||||
|
@ -120,13 +126,15 @@ impl Dist {
|
|||
|
||||
/// Create a [`Dist`] for a URL-based distribution.
|
||||
pub fn from_url(name: PackageName, url: Url) -> Self {
|
||||
// The part after the last slash
|
||||
let filename = url
|
||||
.path()
|
||||
.rsplit_once('/')
|
||||
.map_or(url.path(), |(_path, filename)| filename);
|
||||
if url.scheme().starts_with("git+") {
|
||||
Self::Source(SourceDist::Git(GitSourceDist { name, url }))
|
||||
} else if Path::new(url.path())
|
||||
.extension()
|
||||
.is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
|
||||
{
|
||||
Self::Built(BuiltDist::DirectUrl(DirectUrlBuiltDist { name, url }))
|
||||
} else if let Ok(filename) = WheelFilename::from_str(filename) {
|
||||
Self::Built(BuiltDist::DirectUrl(DirectUrlBuiltDist { filename, url }))
|
||||
} else {
|
||||
Self::Source(SourceDist::DirectUrl(DirectUrlSourceDist { name, url }))
|
||||
}
|
||||
|
@ -145,7 +153,7 @@ impl Dist {
|
|||
match self {
|
||||
Self::Built(built) => Self::Built(match built {
|
||||
BuiltDist::DirectUrl(dist) => BuiltDist::DirectUrl(DirectUrlBuiltDist {
|
||||
name: dist.name,
|
||||
filename: dist.filename,
|
||||
url,
|
||||
}),
|
||||
dist @ BuiltDist::Registry(_) => dist,
|
||||
|
@ -197,7 +205,7 @@ impl Metadata for RegistryBuiltDist {
|
|||
|
||||
impl Metadata for DirectUrlBuiltDist {
|
||||
fn name(&self) -> &PackageName {
|
||||
&self.name
|
||||
&self.filename.name
|
||||
}
|
||||
|
||||
fn version_or_url(&self) -> VersionOrUrl {
|
||||
|
|
|
@ -11,6 +11,9 @@ authors = { workspace = true }
|
|||
license = { workspace = true }
|
||||
|
||||
[dependencies]
|
||||
distribution-filename = { path = "../distribution-filename" }
|
||||
pypi-types = { path = "../pypi-types" }
|
||||
|
||||
clap = { workspace = true, features = ["derive"], optional = true }
|
||||
directories = { workspace = true }
|
||||
fs-err = { workspace = true }
|
||||
|
|
|
@ -11,6 +11,7 @@ mod cache_key;
|
|||
mod canonical_url;
|
||||
mod cli;
|
||||
mod digest;
|
||||
pub mod metadata;
|
||||
|
||||
/// A trait for types that can be hashed in a stable way across versions and platforms.
|
||||
pub trait StableHash {
|
||||
|
|
38
crates/puffin-cache/src/metadata.rs
Normal file
38
crates/puffin-cache/src/metadata.rs
Normal file
|
@ -0,0 +1,38 @@
|
|||
use std::path::{Path, PathBuf};
|
||||
|
||||
use url::Url;
|
||||
|
||||
use pypi_types::IndexUrl;
|
||||
|
||||
use crate::{digest, CanonicalUrl};
|
||||
|
||||
const WHEEL_METADATA_CACHE: &str = "wheel-metadata-v0";
|
||||
|
||||
/// Cache wheel metadata.
|
||||
///
|
||||
/// Wheel metadata can come from a remote wheel or from building a source
|
||||
/// distribution. For a remote wheel, we try the following ways to fetch the metadata:
|
||||
/// 1. From a [PEP 658](https://peps.python.org/pep-0658/) data-dist-info-metadata url
|
||||
/// 2. From a remote wheel by partial zip reading
|
||||
/// 3. From a (temp) download of a remote wheel (this is a fallback, the webserver should support range requests)
|
||||
pub enum WheelMetadataCache {
|
||||
Index(IndexUrl),
|
||||
Url,
|
||||
}
|
||||
|
||||
impl WheelMetadataCache {
|
||||
/// Cache structure:
|
||||
/// * `<wheel metadata cache>/pypi/foo-1.0.0-py3-none-any.json`
|
||||
/// * `<wheel metadata cache>/<digest(index-url)>/foo-1.0.0-py3-none-any.json`
|
||||
/// * `<wheel metadata cache>/url/<digest(url)>/foo-1.0.0-py3-none-any.json`
|
||||
pub fn cache_dir(&self, cache: &Path, url: &Url) -> PathBuf {
|
||||
let cache_root = cache.join(WHEEL_METADATA_CACHE);
|
||||
match self {
|
||||
WheelMetadataCache::Index(IndexUrl::Pypi) => cache_root.join("pypi"),
|
||||
WheelMetadataCache::Index(url) => cache_root
|
||||
.join("index")
|
||||
.join(digest(&CanonicalUrl::new(url))),
|
||||
WheelMetadataCache::Url => cache_root.join("url").join(digest(&CanonicalUrl::new(url))),
|
||||
}
|
||||
}
|
||||
}
|
|
@ -6,6 +6,7 @@ edition = "2021"
|
|||
[dependencies]
|
||||
distribution-filename = { path = "../distribution-filename" }
|
||||
install-wheel-rs = { path = "../install-wheel-rs" }
|
||||
puffin-cache = { path = "../puffin-cache" }
|
||||
puffin-normalize = { path = "../puffin-normalize" }
|
||||
pypi-types = { path = "../pypi-types" }
|
||||
|
||||
|
@ -21,6 +22,7 @@ reqwest-middleware = { workspace = true }
|
|||
reqwest-retry = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
sha2 = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
tempfile = { workspace = true }
|
||||
tokio = { workspace = true, features = ["fs"] }
|
||||
|
|
|
@ -18,14 +18,13 @@ use url::Url;
|
|||
|
||||
use distribution_filename::WheelFilename;
|
||||
use install_wheel_rs::find_dist_info;
|
||||
use puffin_cache::metadata::WheelMetadataCache;
|
||||
use puffin_normalize::PackageName;
|
||||
use pypi_types::{File, IndexUrl, Metadata21, SimpleJson};
|
||||
|
||||
use crate::cached_client::CachedClient;
|
||||
use crate::error::Error;
|
||||
use crate::remote_metadata::{
|
||||
wheel_metadata_from_remote_zip, WHEEL_METADATA_FROM_INDEX, WHEEL_METADATA_FROM_ZIP_CACHE,
|
||||
};
|
||||
use crate::remote_metadata::wheel_metadata_from_remote_zip;
|
||||
|
||||
/// A builder for an [`RegistryClient`].
|
||||
#[derive(Debug, Clone)]
|
||||
|
@ -41,7 +40,7 @@ pub struct RegistryClientBuilder {
|
|||
impl RegistryClientBuilder {
|
||||
pub fn new(cache: impl Into<PathBuf>) -> Self {
|
||||
Self {
|
||||
index: IndexUrl::from(Url::parse("https://pypi.org/simple").unwrap()),
|
||||
index: IndexUrl::Pypi,
|
||||
extra_index: vec![],
|
||||
no_index: false,
|
||||
proxy: Url::parse("https://pypi-metadata.ruff.rs").unwrap(),
|
||||
|
@ -199,7 +198,7 @@ impl RegistryClient {
|
|||
}
|
||||
|
||||
/// Fetch the metadata from a wheel file.
|
||||
pub async fn wheel_metadata(&self, file: File) -> Result<Metadata21, Error> {
|
||||
pub async fn wheel_metadata(&self, index: IndexUrl, file: File) -> Result<Metadata21, Error> {
|
||||
if self.no_index {
|
||||
return Err(Error::NoIndex(file.filename));
|
||||
}
|
||||
|
@ -209,11 +208,12 @@ impl RegistryClient {
|
|||
let filename = WheelFilename::from_str(&file.filename)?;
|
||||
if file
|
||||
.dist_info_metadata
|
||||
.is_some_and(|dist_info_metadata| dist_info_metadata.is_available())
|
||||
.as_ref()
|
||||
.is_some_and(pypi_types::Metadata::is_available)
|
||||
{
|
||||
let url = Url::parse(&format!("{}.metadata", file.url))?;
|
||||
|
||||
let cache_dir = self.cache.join(WHEEL_METADATA_FROM_INDEX).join("pypi");
|
||||
let cache_dir = WheelMetadataCache::Index(index).cache_dir(&self.cache, &url);
|
||||
let cache_file = format!("{}.json", filename.stem());
|
||||
|
||||
let response_callback = |response: Response| async {
|
||||
|
@ -228,17 +228,23 @@ impl RegistryClient {
|
|||
// `.dist-info/METADATA` file from the zip, and if that also fails, download the whole wheel
|
||||
// into the cache and read from there
|
||||
} else {
|
||||
self.wheel_metadata_no_index(&filename, &url).await
|
||||
self.wheel_metadata_no_pep658(&filename, &url, WheelMetadataCache::Index(index))
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the wheel metadata if it isn't available in an index through PEP 658
|
||||
pub async fn wheel_metadata_no_index(
|
||||
pub async fn wheel_metadata_no_pep658(
|
||||
&self,
|
||||
filename: &WheelFilename,
|
||||
url: &Url,
|
||||
cache_shard: WheelMetadataCache,
|
||||
) -> Result<Metadata21, Error> {
|
||||
let cache_dir = self.cache.join(WHEEL_METADATA_FROM_ZIP_CACHE).join("pypi");
|
||||
if self.no_index {
|
||||
return Err(Error::NoIndex(url.to_string()));
|
||||
}
|
||||
|
||||
let cache_dir = cache_shard.cache_dir(&self.cache, url);
|
||||
let cache_file = format!("{}.json", filename.stem());
|
||||
|
||||
// This response callback is special, we actually make a number of subsequent requests to
|
||||
|
|
|
@ -7,9 +7,6 @@ use install_wheel_rs::find_dist_info;
|
|||
|
||||
use crate::Error;
|
||||
|
||||
pub(crate) const WHEEL_METADATA_FROM_INDEX: &str = "wheel-metadat-index-v0";
|
||||
pub(crate) const WHEEL_METADATA_FROM_ZIP_CACHE: &str = "wheel-metadata-remote-v0";
|
||||
|
||||
/// Read the `.dist-info/METADATA` file from a async remote zip reader, so we avoid downloading the
|
||||
/// entire wheel just for the one file.
|
||||
///
|
||||
|
|
|
@ -5,6 +5,7 @@ use tempfile::tempdir;
|
|||
use url::Url;
|
||||
|
||||
use distribution_filename::WheelFilename;
|
||||
use puffin_cache::metadata::WheelMetadataCache;
|
||||
use puffin_client::RegistryClientBuilder;
|
||||
|
||||
#[tokio::test]
|
||||
|
@ -17,7 +18,11 @@ async fn remote_metadata_with_and_without_cache() -> Result<()> {
|
|||
let url = "https://files.pythonhosted.org/packages/00/e5/f12a80907d0884e6dff9c16d0c0114d81b8cd07dc3ae54c5e962cc83037e/tqdm-4.66.1-py3-none-any.whl";
|
||||
let filename = WheelFilename::from_str(url.rsplit_once('/').unwrap().1).unwrap();
|
||||
let metadata = client
|
||||
.wheel_metadata_no_index(&filename, &Url::parse(url).unwrap())
|
||||
.wheel_metadata_no_pep658(
|
||||
&filename,
|
||||
&Url::parse(url).unwrap(),
|
||||
WheelMetadataCache::Url,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(metadata.summary.unwrap(), "Fast, Extensible Progress Meter");
|
||||
|
|
|
@ -5,6 +5,7 @@ use url::Url;
|
|||
|
||||
use anyhow::Result;
|
||||
use distribution_filename::WheelFilename;
|
||||
use puffin_cache::metadata::WheelMetadataCache;
|
||||
use puffin_cache::{CacheArgs, CacheDir};
|
||||
use puffin_client::RegistryClientBuilder;
|
||||
|
||||
|
@ -28,7 +29,9 @@ pub(crate) async fn wheel_metadata(args: WheelMetadataArgs) -> Result<()> {
|
|||
.1,
|
||||
)?;
|
||||
|
||||
let metadata = client.wheel_metadata_no_index(&filename, &args.url).await?;
|
||||
let metadata = client
|
||||
.wheel_metadata_no_pep658(&filename, &args.url, WheelMetadataCache::Url)
|
||||
.await?;
|
||||
println!("{metadata:?}");
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
@ -72,7 +72,9 @@ impl<'a> Fetcher<'a> {
|
|||
match dist {
|
||||
// Fetch the metadata directly from the registry.
|
||||
Dist::Built(BuiltDist::Registry(wheel)) => {
|
||||
let metadata = client.wheel_metadata(wheel.file.clone()).await?;
|
||||
let metadata = client
|
||||
.wheel_metadata(wheel.index.clone(), wheel.file.clone())
|
||||
.await?;
|
||||
Ok(metadata)
|
||||
}
|
||||
// Fetch the distribution, then read the metadata (for built distributions), or build
|
||||
|
|
|
@ -157,7 +157,7 @@ impl Graph {
|
|||
marker: None,
|
||||
},
|
||||
Dist::Built(BuiltDist::DirectUrl(wheel)) => Requirement {
|
||||
name: wheel.name.clone(),
|
||||
name: wheel.filename.name.clone(),
|
||||
extras: None,
|
||||
version_or_url: Some(VersionOrUrl::Url(wheel.url.clone())),
|
||||
marker: None,
|
||||
|
|
|
@ -21,6 +21,7 @@ use distribution_filename::WheelFilename;
|
|||
use distribution_types::{BuiltDist, Dist, Identifier, Metadata, SourceDist, VersionOrUrl};
|
||||
use pep508_rs::{MarkerEnvironment, Requirement};
|
||||
use platform_tags::Tags;
|
||||
use puffin_cache::metadata::WheelMetadataCache;
|
||||
use puffin_cache::CanonicalUrl;
|
||||
use puffin_client::RegistryClient;
|
||||
use puffin_distribution::Fetcher;
|
||||
|
@ -300,8 +301,8 @@ impl<'a, Context: BuildContext + Send + Sync> Resolver<'a, Context> {
|
|||
PubGrubPackage::Package(package_name, _extra, Some(url)) => {
|
||||
// Emit a request to fetch the metadata for this distribution.
|
||||
if in_flight.insert_url(url) {
|
||||
priorities.add(package_name.clone());
|
||||
let distribution = Dist::from_url(package_name.clone(), url.clone());
|
||||
priorities.add(distribution.name().clone());
|
||||
request_sink.unbounded_send(Request::Dist(distribution))?;
|
||||
}
|
||||
}
|
||||
|
@ -605,26 +606,32 @@ impl<'a, Context: BuildContext + Send + Sync> Resolver<'a, Context> {
|
|||
.await
|
||||
}
|
||||
|
||||
// Fetch wheel metadata from the registry if possible. This is a fast-path to avoid
|
||||
// reading from the cache in the common case: we cache wheel metadata in the HTTP
|
||||
// cache, rather than downloading the wheel itself.
|
||||
Request::Dist(Dist::Built(BuiltDist::Registry(wheel))) => {
|
||||
let metadata = self
|
||||
.client
|
||||
.wheel_metadata(wheel.file.clone())
|
||||
.map_err(ResolveError::Client)
|
||||
.await?;
|
||||
if metadata.name != *wheel.name() {
|
||||
// Fetch wheel metadata.
|
||||
Request::Dist(Dist::Built(distribution)) => {
|
||||
let metadata = match &distribution {
|
||||
BuiltDist::Registry(wheel) => {
|
||||
self.client
|
||||
.wheel_metadata(wheel.index.clone(), wheel.file.clone())
|
||||
.await?
|
||||
}
|
||||
BuiltDist::DirectUrl(wheel) => {
|
||||
self.client
|
||||
.wheel_metadata_no_pep658(
|
||||
&wheel.filename,
|
||||
&wheel.url,
|
||||
WheelMetadataCache::Url,
|
||||
)
|
||||
.await?
|
||||
}
|
||||
};
|
||||
|
||||
if metadata.name != *distribution.name() {
|
||||
return Err(ResolveError::NameMismatch {
|
||||
metadata: metadata.name,
|
||||
given: wheel.name().clone(),
|
||||
given: distribution.name().clone(),
|
||||
});
|
||||
}
|
||||
Ok(Response::Dist(
|
||||
Dist::Built(BuiltDist::Registry(wheel)),
|
||||
metadata,
|
||||
None,
|
||||
))
|
||||
Ok(Response::Dist(Dist::Built(distribution), metadata, None))
|
||||
}
|
||||
|
||||
// Fetch distribution metadata.
|
||||
|
|
|
@ -1,17 +1,38 @@
|
|||
use once_cell::sync::Lazy;
|
||||
use std::ops::Deref;
|
||||
use url::Url;
|
||||
|
||||
static PYPI_URL: Lazy<Url> = Lazy::new(|| Url::parse("https://pypi.org/simple").unwrap());
|
||||
|
||||
/// The url of an index, newtype'd to avoid mixing it with file urls
|
||||
#[derive(Debug, Clone, Hash, Eq, PartialEq)]
|
||||
pub struct IndexUrl(Url);
|
||||
pub enum IndexUrl {
|
||||
Pypi,
|
||||
Url(Url),
|
||||
}
|
||||
|
||||
impl From<Url> for IndexUrl {
|
||||
fn from(url: Url) -> Self {
|
||||
Self(url)
|
||||
Self::Url(url)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<IndexUrl> for Url {
|
||||
fn from(index: IndexUrl) -> Self {
|
||||
index.0
|
||||
match index {
|
||||
IndexUrl::Pypi => PYPI_URL.clone(),
|
||||
IndexUrl::Url(url) => url,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for IndexUrl {
|
||||
type Target = Url;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
match &self {
|
||||
IndexUrl::Pypi => &PYPI_URL,
|
||||
IndexUrl::Url(url) => url,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,7 +2,7 @@ pub use direct_url::{ArchiveInfo, DirectUrl, VcsInfo, VcsKind};
|
|||
pub use index_url::IndexUrl;
|
||||
pub use lenient_requirement::LenientVersionSpecifiers;
|
||||
pub use metadata::{Error, Metadata21};
|
||||
pub use simple_json::{File, SimpleJson, Yanked};
|
||||
pub use simple_json::{File, Metadata, SimpleJson, Yanked};
|
||||
|
||||
mod direct_url;
|
||||
mod index_url;
|
||||
|
|
5
scripts/benchmarks/requirements/all-kinds.in
Normal file
5
scripts/benchmarks/requirements/all-kinds.in
Normal file
|
@ -0,0 +1,5 @@
|
|||
# All kinds of dependencies
|
||||
flask @ https://files.pythonhosted.org/packages/36/42/015c23096649b908c809c69388a805a571a3bea44362fe87e33fc3afa01f/flask-3.0.0-py3-none-any.whl
|
||||
django_allauth==0.51.0
|
||||
pandas
|
||||
pydantic-extra-types @ git+https://github.com/pydantic/pydantic-extra-types.git
|
Loading…
Add table
Add a link
Reference in a new issue