Use absolute urls in distribution_type::File (#917)

Previously, the url on file could either be a relative or an absolute
url, depending on the index, and we would finalize it lazily. Now we
finalize the url when converting `pypi_types::File` to
`distribution_types::File`. This change is required to make the hashes
on `File` optional (https://github.com/astral-sh/puffin/pull/910), which
are currently the only unique field usable for caching.
This commit is contained in:
konsti 2024-01-14 18:15:24 +01:00 committed by GitHub
parent 6e18e56789
commit a99e5e00f2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 41 additions and 41 deletions

View file

@ -1,8 +1,19 @@
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use thiserror::Error;
use url::Url;
use pep440_rs::{VersionSpecifiers, VersionSpecifiersParseError}; use pep440_rs::{VersionSpecifiers, VersionSpecifiersParseError};
use pypi_types::{DistInfoMetadata, Hashes, Yanked}; use pypi_types::{BaseUrl, DistInfoMetadata, Hashes, Yanked};
/// Error converting [`pypi_types::File`] to [`distribution_type::File`].
#[derive(Debug, Error)]
pub enum FileConversionError {
#[error("Invalid 'requires-python' value")]
VersionSpecifiersParseError(#[from] VersionSpecifiersParseError),
#[error("Failed to parse URL: {0}")]
Url(String, #[source] url::ParseError),
}
/// Internal analog to [`pypi_types::File`]. /// Internal analog to [`pypi_types::File`].
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
@ -13,15 +24,13 @@ pub struct File {
pub requires_python: Option<VersionSpecifiers>, pub requires_python: Option<VersionSpecifiers>,
pub size: Option<u64>, pub size: Option<u64>,
pub upload_time: Option<DateTime<Utc>>, pub upload_time: Option<DateTime<Utc>>,
pub url: String, pub url: Url,
pub yanked: Option<Yanked>, pub yanked: Option<Yanked>,
} }
impl TryFrom<pypi_types::File> for File { impl File {
type Error = VersionSpecifiersParseError;
/// `TryFrom` instead of `From` to filter out files with invalid requires python version specifiers /// `TryFrom` instead of `From` to filter out files with invalid requires python version specifiers
fn try_from(file: pypi_types::File) -> Result<Self, Self::Error> { pub fn try_from(file: pypi_types::File, base: &BaseUrl) -> Result<Self, FileConversionError> {
Ok(Self { Ok(Self {
dist_info_metadata: file.dist_info_metadata, dist_info_metadata: file.dist_info_metadata,
filename: file.filename, filename: file.filename,
@ -29,7 +38,9 @@ impl TryFrom<pypi_types::File> for File {
requires_python: file.requires_python.transpose()?, requires_python: file.requires_python.transpose()?,
size: file.size, size: file.size,
upload_time: file.upload_time, upload_time: file.upload_time,
url: file.url, url: base
.join_relative(&file.url)
.map_err(|err| FileConversionError::Url(file.url.clone(), err))?,
yanked: file.yanked, yanked: file.yanked,
}) })
} }

View file

@ -171,15 +171,16 @@ impl RegistryClient {
let bytes = response.bytes().await?; let bytes = response.bytes().await?;
let data: SimpleJson = serde_json::from_slice(bytes.as_ref()) let data: SimpleJson = serde_json::from_slice(bytes.as_ref())
.map_err(|err| Error::from_json_err(err, url.clone()))?; .map_err(|err| Error::from_json_err(err, url.clone()))?;
let metadata = SimpleMetadata::from_files(data.files, package_name);
let base = BaseUrl::from(url.clone()); let base = BaseUrl::from(url.clone());
let metadata =
SimpleMetadata::from_files(data.files, package_name, &base);
Ok((base, metadata)) Ok((base, metadata))
} }
MediaType::Html => { MediaType::Html => {
let text = response.text().await?; let text = response.text().await?;
let SimpleHtml { base, files } = SimpleHtml::parse(&text, &url) let SimpleHtml { base, files } = SimpleHtml::parse(&text, &url)
.map_err(|err| Error::from_html_err(err, url.clone()))?; .map_err(|err| Error::from_html_err(err, url.clone()))?;
let metadata = SimpleMetadata::from_files(files, package_name); let metadata = SimpleMetadata::from_files(files, package_name, &base);
Ok((base, metadata)) Ok((base, metadata))
} }
} }
@ -217,7 +218,7 @@ impl RegistryClient {
pub async fn wheel_metadata(&self, built_dist: &BuiltDist) -> Result<Metadata21, Error> { pub async fn wheel_metadata(&self, built_dist: &BuiltDist) -> Result<Metadata21, Error> {
let metadata = match &built_dist { let metadata = match &built_dist {
BuiltDist::Registry(wheel) => { BuiltDist::Registry(wheel) => {
self.wheel_metadata_registry(&wheel.index, &wheel.base, &wheel.file) self.wheel_metadata_registry(&wheel.index, &wheel.file)
.await? .await?
} }
BuiltDist::DirectUrl(wheel) => { BuiltDist::DirectUrl(wheel) => {
@ -248,7 +249,6 @@ impl RegistryClient {
async fn wheel_metadata_registry( async fn wheel_metadata_registry(
&self, &self,
index: &IndexUrl, index: &IndexUrl,
base: &BaseUrl,
file: &File, file: &File,
) -> Result<Metadata21, Error> { ) -> Result<Metadata21, Error> {
if self.index_urls.no_index() { if self.index_urls.no_index() {
@ -256,14 +256,13 @@ impl RegistryClient {
} }
// If the metadata file is available at its own url (PEP 658), download it from there. // If the metadata file is available at its own url (PEP 658), download it from there.
let url = base.join_relative(&file.url)?;
let filename = WheelFilename::from_str(&file.filename)?; let filename = WheelFilename::from_str(&file.filename)?;
if file if file
.dist_info_metadata .dist_info_metadata
.as_ref() .as_ref()
.is_some_and(pypi_types::DistInfoMetadata::is_available) .is_some_and(pypi_types::DistInfoMetadata::is_available)
{ {
let url = Url::parse(&format!("{url}.metadata"))?; let url = Url::parse(&format!("{}.metadata", file.url))?;
let cache_entry = self.cache.entry( let cache_entry = self.cache.entry(
CacheBucket::Wheels, CacheBucket::Wheels,
@ -287,7 +286,7 @@ impl RegistryClient {
// If we lack PEP 658 support, try using HTTP range requests to read only the // If we lack PEP 658 support, try using HTTP range requests to read only the
// `.dist-info/METADATA` file from the zip, and if that also fails, download the whole wheel // `.dist-info/METADATA` file from the zip, and if that also fails, download the whole wheel
// into the cache and read from there // into the cache and read from there
self.wheel_metadata_no_pep658(&filename, &url, WheelCache::Index(index)) self.wheel_metadata_no_pep658(&filename, &file.url, WheelCache::Index(index))
.await .await
} }
} }
@ -452,7 +451,11 @@ impl SimpleMetadata {
self.0.iter() self.0.iter()
} }
fn from_files(files: Vec<pypi_types::File>, package_name: &PackageName) -> Self { fn from_files(
files: Vec<pypi_types::File>,
package_name: &PackageName,
base: &BaseUrl,
) -> Self {
let mut metadata = Self::default(); let mut metadata = Self::default();
// Group the distributions by version and kind // Group the distributions by version and kind
@ -465,7 +468,7 @@ impl SimpleMetadata {
DistFilename::WheelFilename(ref inner) => &inner.version, DistFilename::WheelFilename(ref inner) => &inner.version,
}; };
let file = match file.try_into() { let file = match File::try_from(file, base) {
Ok(file) => file, Ok(file) => file,
Err(err) => { Err(err) => {
// Ignore files with unparseable version specifiers. // Ignore files with unparseable version specifiers.
@ -526,9 +529,10 @@ impl MediaType {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::str::FromStr; use std::str::FromStr;
use url::Url;
use puffin_normalize::PackageName; use puffin_normalize::PackageName;
use pypi_types::SimpleJson; use pypi_types::{BaseUrl, SimpleJson};
use crate::SimpleMetadata; use crate::SimpleMetadata;
@ -568,8 +572,12 @@ mod tests {
} }
"#; "#;
let data: SimpleJson = serde_json::from_str(response).unwrap(); let data: SimpleJson = serde_json::from_str(response).unwrap();
let simple_metadata = let base = BaseUrl::from(Url::from_str("https://pypi.org/simple/pyflyby/").unwrap());
SimpleMetadata::from_files(data.files, &PackageName::from_str("pyflyby").unwrap()); let simple_metadata = SimpleMetadata::from_files(
data.files,
&PackageName::from_str("pyflyby").unwrap(),
&base,
);
let versions: Vec<String> = simple_metadata let versions: Vec<String> = simple_metadata
.iter() .iter()
.map(|(version, _)| version.to_string()) .map(|(version, _)| version.to_string())

View file

@ -108,11 +108,6 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
) -> Result<LocalWheel, DistributionDatabaseError> { ) -> Result<LocalWheel, DistributionDatabaseError> {
match &dist { match &dist {
Dist::Built(BuiltDist::Registry(wheel)) => { Dist::Built(BuiltDist::Registry(wheel)) => {
let url = wheel
.base
.join_relative(&wheel.file.url)
.map_err(|err| DistributionDatabaseError::Url(wheel.file.url.clone(), err))?;
// Download and unzip on the same tokio task. // Download and unzip on the same tokio task.
// //
// In all wheels we've seen so far, unzipping while downloading is // In all wheels we've seen so far, unzipping while downloading is
@ -128,7 +123,7 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
// for downloading and unzipping (with a buffer in between) and switch // for downloading and unzipping (with a buffer in between) and switch
// to rayon if this buffer grows large by the time the file is fully // to rayon if this buffer grows large by the time the file is fully
// downloaded. // downloaded.
let reader = self.client.stream_external(&url).await?; let reader = self.client.stream_external(&wheel.file.url).await?;
// Download and unzip the wheel to a temporary directory. // Download and unzip the wheel to a temporary directory.
let temp_dir = tempfile::tempdir_in(self.cache.root())?; let temp_dir = tempfile::tempdir_in(self.cache.root())?;

View file

@ -98,13 +98,6 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
.await? .await?
} }
SourceDist::Registry(registry_source_dist) => { SourceDist::Registry(registry_source_dist) => {
let url = registry_source_dist
.base
.join_relative(&registry_source_dist.file.url)
.map_err(|err| {
SourceDistError::UrlParse(registry_source_dist.file.url.clone(), err)
})?;
// For registry source distributions, shard by package, then by SHA. // For registry source distributions, shard by package, then by SHA.
// Ex) `pypi/requests/a673187abc19fe6c` // Ex) `pypi/requests/a673187abc19fe6c`
let cache_shard = self.build_context.cache().shard( let cache_shard = self.build_context.cache().shard(
@ -117,7 +110,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
self.url( self.url(
source_dist, source_dist,
&registry_source_dist.file.filename, &registry_source_dist.file.filename,
&url, &registry_source_dist.file.url,
&cache_shard, &cache_shard,
None, None,
) )
@ -161,13 +154,6 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
.await? .await?
} }
SourceDist::Registry(registry_source_dist) => { SourceDist::Registry(registry_source_dist) => {
let url = registry_source_dist
.base
.join_relative(&registry_source_dist.file.url)
.map_err(|err| {
SourceDistError::UrlParse(registry_source_dist.file.url.clone(), err)
})?;
// For registry source distributions, shard by package, then by SHA. // For registry source distributions, shard by package, then by SHA.
// Ex) `pypi/requests/a673187abc19fe6c` // Ex) `pypi/requests/a673187abc19fe6c`
let cache_shard = self.build_context.cache().shard( let cache_shard = self.build_context.cache().shard(
@ -180,7 +166,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
self.url_metadata( self.url_metadata(
source_dist, source_dist,
&registry_source_dist.file.filename, &registry_source_dist.file.filename,
&url, &registry_source_dist.file.url,
&cache_shard, &cache_shard,
None, None,
) )