Wheel metadata refactor (#462)

A consistent cache structure for remote wheel metadata:

* `<wheel metadata cache>/pypi/foo-1.0.0-py3-none-any.json`
* `<wheel metadata cache>/index/<digest(index-url)>/foo-1.0.0-py3-none-any.json`
* `<wheel metadata cache>/url/<digest(url)>/foo-1.0.0-py3-none-any.json`

The source dist caching will use a similar structure (#468).
This commit is contained in:
konsti 2023-11-20 17:26:36 +01:00 committed by GitHub
parent d3e9e1783f
commit f0841cdb6e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
18 changed files with 153 additions and 47 deletions

7
Cargo.lock generated
View file

@ -874,6 +874,7 @@ name = "distribution-types"
version = "0.0.1"
dependencies = [
"anyhow",
"distribution-filename",
"fs-err",
"pep440_rs 0.3.12",
"puffin-cache",
@ -2371,8 +2372,10 @@ version = "0.0.1"
dependencies = [
"clap",
"directories",
"distribution-filename",
"fs-err",
"hex",
"pypi-types",
"seahash",
"tempfile",
"url",
@ -2447,6 +2450,7 @@ dependencies = [
"http-cache-reqwest",
"http-cache-semantics",
"install-wheel-rs",
"puffin-cache",
"puffin-normalize",
"pypi-types",
"reqwest",
@ -2454,6 +2458,7 @@ dependencies = [
"reqwest-retry",
"serde",
"serde_json",
"sha2",
"tempfile",
"thiserror",
"tokio",
@ -2618,8 +2623,10 @@ dependencies = [
name = "puffin-macros"
version = "0.0.1"
dependencies = [
"colored",
"fxhash",
"once_cell",
"tracing",
]
[[package]]

View file

@ -10,6 +10,7 @@ authors = { workspace = true }
license = { workspace = true }
[dependencies]
distribution-filename = { path = "../distribution-filename" }
pep440_rs = { path = "../pep440-rs" }
puffin-cache = { path = "../puffin-cache" }
puffin-git = { path = "../puffin-git" }

View file

@ -80,7 +80,7 @@ impl CachedDist {
path,
}),
Dist::Built(BuiltDist::DirectUrl(dist)) => Self::Url(CachedDirectUrlDist {
name: dist.name,
name: dist.filename.name,
url: dist.url,
path,
}),

View file

@ -1,6 +1,8 @@
use std::path::Path;
use std::str::FromStr;
use anyhow::{Context, Result};
use distribution_filename::WheelFilename;
use url::Url;
use pep440_rs::Version;
@ -68,7 +70,9 @@ pub struct RegistryBuiltDist {
/// A built distribution (wheel) that exists at an arbitrary URL.
#[derive(Debug, Clone)]
pub struct DirectUrlBuiltDist {
pub name: PackageName,
/// We require that wheel urls end in the full wheel filename, e.g.
/// `https://example.org/packages/flask-3.0.0-py3-none-any.whl`
pub filename: WheelFilename,
pub url: Url,
}
@ -84,6 +88,8 @@ pub struct RegistrySourceDist {
/// A source distribution that exists at an arbitrary URL.
#[derive(Debug, Clone)]
pub struct DirectUrlSourceDist {
/// Unlike [`DirectUrlBuiltDist`], we can't require a full filename with a version here, because
/// people like using e.g. `foo @ https://github.com/org/repo/archive/master.zip`.
pub name: PackageName,
pub url: Url,
}
@ -120,13 +126,15 @@ impl Dist {
/// Create a [`Dist`] for a URL-based distribution.
pub fn from_url(name: PackageName, url: Url) -> Self {
// The part after the last slash
let filename = url
.path()
.rsplit_once('/')
.map_or(url.path(), |(_path, filename)| filename);
if url.scheme().starts_with("git+") {
Self::Source(SourceDist::Git(GitSourceDist { name, url }))
} else if Path::new(url.path())
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
{
Self::Built(BuiltDist::DirectUrl(DirectUrlBuiltDist { name, url }))
} else if let Ok(filename) = WheelFilename::from_str(filename) {
Self::Built(BuiltDist::DirectUrl(DirectUrlBuiltDist { filename, url }))
} else {
Self::Source(SourceDist::DirectUrl(DirectUrlSourceDist { name, url }))
}
@ -145,7 +153,7 @@ impl Dist {
match self {
Self::Built(built) => Self::Built(match built {
BuiltDist::DirectUrl(dist) => BuiltDist::DirectUrl(DirectUrlBuiltDist {
name: dist.name,
filename: dist.filename,
url,
}),
dist @ BuiltDist::Registry(_) => dist,
@ -197,7 +205,7 @@ impl Metadata for RegistryBuiltDist {
impl Metadata for DirectUrlBuiltDist {
fn name(&self) -> &PackageName {
&self.name
&self.filename.name
}
fn version_or_url(&self) -> VersionOrUrl {

View file

@ -11,6 +11,9 @@ authors = { workspace = true }
license = { workspace = true }
[dependencies]
distribution-filename = { path = "../distribution-filename" }
pypi-types = { path = "../pypi-types" }
clap = { workspace = true, features = ["derive"], optional = true }
directories = { workspace = true }
fs-err = { workspace = true }

View file

@ -11,6 +11,7 @@ mod cache_key;
mod canonical_url;
mod cli;
mod digest;
pub mod metadata;
/// A trait for types that can be hashed in a stable way across versions and platforms.
pub trait StableHash {

View file

@ -0,0 +1,38 @@
use std::path::{Path, PathBuf};
use url::Url;
use pypi_types::IndexUrl;
use crate::{digest, CanonicalUrl};
/// Name of the directory, directly below the cache root, that holds all cached wheel metadata.
const WHEEL_METADATA_CACHE: &str = "wheel-metadata-v0";
/// Selects the shard of the wheel metadata cache that an entry belongs to.
///
/// Wheel metadata can come from a remote wheel or from building a source
/// distribution. For a remote wheel, we try the following ways to fetch the metadata:
/// 1. From a [PEP 658](https://peps.python.org/pep-0658/) data-dist-info-metadata url
/// 2. From a remote wheel by partial zip reading
/// 3. From a (temp) download of a remote wheel (this is a fallback, the webserver should
///    support range requests)
#[derive(Debug, Clone)]
pub enum WheelMetadataCache {
    /// The wheel was listed by the given package index.
    Index(IndexUrl),
    /// The wheel is addressed by a direct URL rather than through an index.
    Url,
}
impl WheelMetadataCache {
    /// Returns the directory in which the metadata files of this shard are stored.
    ///
    /// `cache` is the cache root. `url` is only consulted for [`WheelMetadataCache::Url`], where
    /// its digest names the directory; the index variants ignore it. The metadata file name
    /// itself (e.g. `foo-1.0.0-py3-none-any.json`) is not part of the returned path.
    ///
    /// Cache structure:
    ///  * `<wheel metadata cache>/pypi/foo-1.0.0-py3-none-any.json`
    ///  * `<wheel metadata cache>/index/<digest(index-url)>/foo-1.0.0-py3-none-any.json`
    ///  * `<wheel metadata cache>/url/<digest(url)>/foo-1.0.0-py3-none-any.json`
    pub fn cache_dir(&self, cache: &Path, url: &Url) -> PathBuf {
        let cache_root = cache.join(WHEEL_METADATA_CACHE);
        match self {
            // Only the dedicated `Pypi` variant gets the readable directory name.
            Self::Index(IndexUrl::Pypi) => cache_root.join("pypi"),
            // Every other index is keyed by the digest of its own url. The binding is named
            // `index_url` so that it does not shadow the `url` parameter.
            Self::Index(index_url) => cache_root
                .join("index")
                .join(digest(&CanonicalUrl::new(index_url))),
            Self::Url => cache_root.join("url").join(digest(&CanonicalUrl::new(url))),
        }
    }
}

View file

@ -6,6 +6,7 @@ edition = "2021"
[dependencies]
distribution-filename = { path = "../distribution-filename" }
install-wheel-rs = { path = "../install-wheel-rs" }
puffin-cache = { path = "../puffin-cache" }
puffin-normalize = { path = "../puffin-normalize" }
pypi-types = { path = "../pypi-types" }
@ -21,6 +22,7 @@ reqwest-middleware = { workspace = true }
reqwest-retry = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
sha2 = { workspace = true }
thiserror = { workspace = true }
tempfile = { workspace = true }
tokio = { workspace = true, features = ["fs"] }

View file

@ -18,14 +18,13 @@ use url::Url;
use distribution_filename::WheelFilename;
use install_wheel_rs::find_dist_info;
use puffin_cache::metadata::WheelMetadataCache;
use puffin_normalize::PackageName;
use pypi_types::{File, IndexUrl, Metadata21, SimpleJson};
use crate::cached_client::CachedClient;
use crate::error::Error;
use crate::remote_metadata::{
wheel_metadata_from_remote_zip, WHEEL_METADATA_FROM_INDEX, WHEEL_METADATA_FROM_ZIP_CACHE,
};
use crate::remote_metadata::wheel_metadata_from_remote_zip;
/// A builder for a [`RegistryClient`].
#[derive(Debug, Clone)]
@ -41,7 +40,7 @@ pub struct RegistryClientBuilder {
impl RegistryClientBuilder {
pub fn new(cache: impl Into<PathBuf>) -> Self {
Self {
index: IndexUrl::from(Url::parse("https://pypi.org/simple").unwrap()),
index: IndexUrl::Pypi,
extra_index: vec![],
no_index: false,
proxy: Url::parse("https://pypi-metadata.ruff.rs").unwrap(),
@ -199,7 +198,7 @@ impl RegistryClient {
}
/// Fetch the metadata from a wheel file.
pub async fn wheel_metadata(&self, file: File) -> Result<Metadata21, Error> {
pub async fn wheel_metadata(&self, index: IndexUrl, file: File) -> Result<Metadata21, Error> {
if self.no_index {
return Err(Error::NoIndex(file.filename));
}
@ -209,11 +208,12 @@ impl RegistryClient {
let filename = WheelFilename::from_str(&file.filename)?;
if file
.dist_info_metadata
.is_some_and(|dist_info_metadata| dist_info_metadata.is_available())
.as_ref()
.is_some_and(pypi_types::Metadata::is_available)
{
let url = Url::parse(&format!("{}.metadata", file.url))?;
let cache_dir = self.cache.join(WHEEL_METADATA_FROM_INDEX).join("pypi");
let cache_dir = WheelMetadataCache::Index(index).cache_dir(&self.cache, &url);
let cache_file = format!("{}.json", filename.stem());
let response_callback = |response: Response| async {
@ -228,17 +228,23 @@ impl RegistryClient {
// `.dist-info/METADATA` file from the zip, and if that also fails, download the whole wheel
// into the cache and read from there
} else {
self.wheel_metadata_no_index(&filename, &url).await
self.wheel_metadata_no_pep658(&filename, &url, WheelMetadataCache::Index(index))
.await
}
}
/// Get the wheel metadata if it isn't available in an index through PEP 658
pub async fn wheel_metadata_no_index(
pub async fn wheel_metadata_no_pep658(
&self,
filename: &WheelFilename,
url: &Url,
cache_shard: WheelMetadataCache,
) -> Result<Metadata21, Error> {
let cache_dir = self.cache.join(WHEEL_METADATA_FROM_ZIP_CACHE).join("pypi");
if self.no_index {
return Err(Error::NoIndex(url.to_string()));
}
let cache_dir = cache_shard.cache_dir(&self.cache, url);
let cache_file = format!("{}.json", filename.stem());
// This response callback is special, we actually make a number of subsequent requests to

View file

@ -7,9 +7,6 @@ use install_wheel_rs::find_dist_info;
use crate::Error;
pub(crate) const WHEEL_METADATA_FROM_INDEX: &str = "wheel-metadat-index-v0";
pub(crate) const WHEEL_METADATA_FROM_ZIP_CACHE: &str = "wheel-metadata-remote-v0";
/// Read the `.dist-info/METADATA` file from an async remote zip reader, so we avoid downloading the
/// entire wheel just for the one file.
///

View file

@ -5,6 +5,7 @@ use tempfile::tempdir;
use url::Url;
use distribution_filename::WheelFilename;
use puffin_cache::metadata::WheelMetadataCache;
use puffin_client::RegistryClientBuilder;
#[tokio::test]
@ -17,7 +18,11 @@ async fn remote_metadata_with_and_without_cache() -> Result<()> {
let url = "https://files.pythonhosted.org/packages/00/e5/f12a80907d0884e6dff9c16d0c0114d81b8cd07dc3ae54c5e962cc83037e/tqdm-4.66.1-py3-none-any.whl";
let filename = WheelFilename::from_str(url.rsplit_once('/').unwrap().1).unwrap();
let metadata = client
.wheel_metadata_no_index(&filename, &Url::parse(url).unwrap())
.wheel_metadata_no_pep658(
&filename,
&Url::parse(url).unwrap(),
WheelMetadataCache::Url,
)
.await
.unwrap();
assert_eq!(metadata.summary.unwrap(), "Fast, Extensible Progress Meter");

View file

@ -5,6 +5,7 @@ use url::Url;
use anyhow::Result;
use distribution_filename::WheelFilename;
use puffin_cache::metadata::WheelMetadataCache;
use puffin_cache::{CacheArgs, CacheDir};
use puffin_client::RegistryClientBuilder;
@ -28,7 +29,9 @@ pub(crate) async fn wheel_metadata(args: WheelMetadataArgs) -> Result<()> {
.1,
)?;
let metadata = client.wheel_metadata_no_index(&filename, &args.url).await?;
let metadata = client
.wheel_metadata_no_pep658(&filename, &args.url, WheelMetadataCache::Url)
.await?;
println!("{metadata:?}");
Ok(())
}

View file

@ -72,7 +72,9 @@ impl<'a> Fetcher<'a> {
match dist {
// Fetch the metadata directly from the registry.
Dist::Built(BuiltDist::Registry(wheel)) => {
let metadata = client.wheel_metadata(wheel.file.clone()).await?;
let metadata = client
.wheel_metadata(wheel.index.clone(), wheel.file.clone())
.await?;
Ok(metadata)
}
// Fetch the distribution, then read the metadata (for built distributions), or build

View file

@ -157,7 +157,7 @@ impl Graph {
marker: None,
},
Dist::Built(BuiltDist::DirectUrl(wheel)) => Requirement {
name: wheel.name.clone(),
name: wheel.filename.name.clone(),
extras: None,
version_or_url: Some(VersionOrUrl::Url(wheel.url.clone())),
marker: None,

View file

@ -21,6 +21,7 @@ use distribution_filename::WheelFilename;
use distribution_types::{BuiltDist, Dist, Identifier, Metadata, SourceDist, VersionOrUrl};
use pep508_rs::{MarkerEnvironment, Requirement};
use platform_tags::Tags;
use puffin_cache::metadata::WheelMetadataCache;
use puffin_cache::CanonicalUrl;
use puffin_client::RegistryClient;
use puffin_distribution::Fetcher;
@ -300,8 +301,8 @@ impl<'a, Context: BuildContext + Send + Sync> Resolver<'a, Context> {
PubGrubPackage::Package(package_name, _extra, Some(url)) => {
// Emit a request to fetch the metadata for this distribution.
if in_flight.insert_url(url) {
priorities.add(package_name.clone());
let distribution = Dist::from_url(package_name.clone(), url.clone());
priorities.add(distribution.name().clone());
request_sink.unbounded_send(Request::Dist(distribution))?;
}
}
@ -605,26 +606,32 @@ impl<'a, Context: BuildContext + Send + Sync> Resolver<'a, Context> {
.await
}
// Fetch wheel metadata from the registry if possible. This is a fast-path to avoid
// reading from the cache in the common case: we cache wheel metadata in the HTTP
// cache, rather than downloading the wheel itself.
Request::Dist(Dist::Built(BuiltDist::Registry(wheel))) => {
let metadata = self
.client
.wheel_metadata(wheel.file.clone())
.map_err(ResolveError::Client)
.await?;
if metadata.name != *wheel.name() {
// Fetch wheel metadata.
Request::Dist(Dist::Built(distribution)) => {
let metadata = match &distribution {
BuiltDist::Registry(wheel) => {
self.client
.wheel_metadata(wheel.index.clone(), wheel.file.clone())
.await?
}
BuiltDist::DirectUrl(wheel) => {
self.client
.wheel_metadata_no_pep658(
&wheel.filename,
&wheel.url,
WheelMetadataCache::Url,
)
.await?
}
};
if metadata.name != *distribution.name() {
return Err(ResolveError::NameMismatch {
metadata: metadata.name,
given: wheel.name().clone(),
given: distribution.name().clone(),
});
}
Ok(Response::Dist(
Dist::Built(BuiltDist::Registry(wheel)),
metadata,
None,
))
Ok(Response::Dist(Dist::Built(distribution), metadata, None))
}
// Fetch distribution metadata.

View file

@ -1,17 +1,38 @@
use once_cell::sync::Lazy;
use std::ops::Deref;
use url::Url;
static PYPI_URL: Lazy<Url> = Lazy::new(|| Url::parse("https://pypi.org/simple").unwrap());
/// The url of an index, newtype'd to avoid mixing it with file urls
#[derive(Debug, Clone, Hash, Eq, PartialEq)]
pub struct IndexUrl(Url);
pub enum IndexUrl {
Pypi,
Url(Url),
}
impl From<Url> for IndexUrl {
fn from(url: Url) -> Self {
Self(url)
Self::Url(url)
}
}
impl From<IndexUrl> for Url {
fn from(index: IndexUrl) -> Self {
index.0
match index {
IndexUrl::Pypi => PYPI_URL.clone(),
IndexUrl::Url(url) => url,
}
}
}
impl Deref for IndexUrl {
type Target = Url;
fn deref(&self) -> &Self::Target {
match &self {
IndexUrl::Pypi => &PYPI_URL,
IndexUrl::Url(url) => url,
}
}
}

View file

@ -2,7 +2,7 @@ pub use direct_url::{ArchiveInfo, DirectUrl, VcsInfo, VcsKind};
pub use index_url::IndexUrl;
pub use lenient_requirement::LenientVersionSpecifiers;
pub use metadata::{Error, Metadata21};
pub use simple_json::{File, SimpleJson, Yanked};
pub use simple_json::{File, Metadata, SimpleJson, Yanked};
mod direct_url;
mod index_url;

View file

@ -0,0 +1,5 @@
# All kinds of dependencies
flask @ https://files.pythonhosted.org/packages/36/42/015c23096649b908c809c69388a805a571a3bea44362fe87e33fc3afa01f/flask-3.0.0-py3-none-any.whl
django_allauth==0.51.0
pandas
pydantic-extra-types @ git+https://github.com/pydantic/pydantic-extra-types.git