Use absolute urls in distribution_type::File (#917)

Previously, the url on file could either be a relative or an absolute
url, depending on the index, and we would finalize it lazily. Now we
finalize the url when converting `pypi_types::File` to
`distribution_types::File`. This change is required to make the hashes
on `File` optional (https://github.com/astral-sh/puffin/pull/910), which
are currently the only unique field usable for caching.
This commit is contained in:
konsti 2024-01-14 18:15:24 +01:00 committed by GitHub
parent 6e18e56789
commit a99e5e00f2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 41 additions and 41 deletions

View file

@ -1,8 +1,19 @@
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use thiserror::Error;
use url::Url;
use pep440_rs::{VersionSpecifiers, VersionSpecifiersParseError};
use pypi_types::{DistInfoMetadata, Hashes, Yanked};
use pypi_types::{BaseUrl, DistInfoMetadata, Hashes, Yanked};
/// Error converting [`pypi_types::File`] to [`distribution_type::File`].
#[derive(Debug, Error)]
pub enum FileConversionError {
#[error("Invalid 'requires-python' value")]
VersionSpecifiersParseError(#[from] VersionSpecifiersParseError),
#[error("Failed to parse URL: {0}")]
Url(String, #[source] url::ParseError),
}
/// Internal analog to [`pypi_types::File`].
#[derive(Debug, Clone, Serialize, Deserialize)]
@ -13,15 +24,13 @@ pub struct File {
pub requires_python: Option<VersionSpecifiers>,
pub size: Option<u64>,
pub upload_time: Option<DateTime<Utc>>,
pub url: String,
pub url: Url,
pub yanked: Option<Yanked>,
}
impl TryFrom<pypi_types::File> for File {
type Error = VersionSpecifiersParseError;
impl File {
/// `TryFrom` instead of `From` to filter out files with invalid requires python version specifiers
fn try_from(file: pypi_types::File) -> Result<Self, Self::Error> {
pub fn try_from(file: pypi_types::File, base: &BaseUrl) -> Result<Self, FileConversionError> {
Ok(Self {
dist_info_metadata: file.dist_info_metadata,
filename: file.filename,
@ -29,7 +38,9 @@ impl TryFrom<pypi_types::File> for File {
requires_python: file.requires_python.transpose()?,
size: file.size,
upload_time: file.upload_time,
url: file.url,
url: base
.join_relative(&file.url)
.map_err(|err| FileConversionError::Url(file.url.clone(), err))?,
yanked: file.yanked,
})
}

View file

@ -171,15 +171,16 @@ impl RegistryClient {
let bytes = response.bytes().await?;
let data: SimpleJson = serde_json::from_slice(bytes.as_ref())
.map_err(|err| Error::from_json_err(err, url.clone()))?;
let metadata = SimpleMetadata::from_files(data.files, package_name);
let base = BaseUrl::from(url.clone());
let metadata =
SimpleMetadata::from_files(data.files, package_name, &base);
Ok((base, metadata))
}
MediaType::Html => {
let text = response.text().await?;
let SimpleHtml { base, files } = SimpleHtml::parse(&text, &url)
.map_err(|err| Error::from_html_err(err, url.clone()))?;
let metadata = SimpleMetadata::from_files(files, package_name);
let metadata = SimpleMetadata::from_files(files, package_name, &base);
Ok((base, metadata))
}
}
@ -217,7 +218,7 @@ impl RegistryClient {
pub async fn wheel_metadata(&self, built_dist: &BuiltDist) -> Result<Metadata21, Error> {
let metadata = match &built_dist {
BuiltDist::Registry(wheel) => {
self.wheel_metadata_registry(&wheel.index, &wheel.base, &wheel.file)
self.wheel_metadata_registry(&wheel.index, &wheel.file)
.await?
}
BuiltDist::DirectUrl(wheel) => {
@ -248,7 +249,6 @@ impl RegistryClient {
async fn wheel_metadata_registry(
&self,
index: &IndexUrl,
base: &BaseUrl,
file: &File,
) -> Result<Metadata21, Error> {
if self.index_urls.no_index() {
@ -256,14 +256,13 @@ impl RegistryClient {
}
// If the metadata file is available at its own url (PEP 658), download it from there.
let url = base.join_relative(&file.url)?;
let filename = WheelFilename::from_str(&file.filename)?;
if file
.dist_info_metadata
.as_ref()
.is_some_and(pypi_types::DistInfoMetadata::is_available)
{
let url = Url::parse(&format!("{url}.metadata"))?;
let url = Url::parse(&format!("{}.metadata", file.url))?;
let cache_entry = self.cache.entry(
CacheBucket::Wheels,
@ -287,7 +286,7 @@ impl RegistryClient {
// If we lack PEP 658 support, try using HTTP range requests to read only the
// `.dist-info/METADATA` file from the zip, and if that also fails, download the whole wheel
// into the cache and read from there
self.wheel_metadata_no_pep658(&filename, &url, WheelCache::Index(index))
self.wheel_metadata_no_pep658(&filename, &file.url, WheelCache::Index(index))
.await
}
}
@ -452,7 +451,11 @@ impl SimpleMetadata {
self.0.iter()
}
fn from_files(files: Vec<pypi_types::File>, package_name: &PackageName) -> Self {
fn from_files(
files: Vec<pypi_types::File>,
package_name: &PackageName,
base: &BaseUrl,
) -> Self {
let mut metadata = Self::default();
// Group the distributions by version and kind
@ -465,7 +468,7 @@ impl SimpleMetadata {
DistFilename::WheelFilename(ref inner) => &inner.version,
};
let file = match file.try_into() {
let file = match File::try_from(file, base) {
Ok(file) => file,
Err(err) => {
// Ignore files with unparseable version specifiers.
@ -526,9 +529,10 @@ impl MediaType {
#[cfg(test)]
mod tests {
use std::str::FromStr;
use url::Url;
use puffin_normalize::PackageName;
use pypi_types::SimpleJson;
use pypi_types::{BaseUrl, SimpleJson};
use crate::SimpleMetadata;
@ -568,8 +572,12 @@ mod tests {
}
"#;
let data: SimpleJson = serde_json::from_str(response).unwrap();
let simple_metadata =
SimpleMetadata::from_files(data.files, &PackageName::from_str("pyflyby").unwrap());
let base = BaseUrl::from(Url::from_str("https://pypi.org/simple/pyflyby/").unwrap());
let simple_metadata = SimpleMetadata::from_files(
data.files,
&PackageName::from_str("pyflyby").unwrap(),
&base,
);
let versions: Vec<String> = simple_metadata
.iter()
.map(|(version, _)| version.to_string())

View file

@ -108,11 +108,6 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
) -> Result<LocalWheel, DistributionDatabaseError> {
match &dist {
Dist::Built(BuiltDist::Registry(wheel)) => {
let url = wheel
.base
.join_relative(&wheel.file.url)
.map_err(|err| DistributionDatabaseError::Url(wheel.file.url.clone(), err))?;
// Download and unzip on the same tokio task.
//
// In all wheels we've seen so far, unzipping while downloading is
@ -128,7 +123,7 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
// for downloading and unzipping (with a buffer in between) and switch
// to rayon if this buffer grows large by the time the file is fully
// downloaded.
let reader = self.client.stream_external(&url).await?;
let reader = self.client.stream_external(&wheel.file.url).await?;
// Download and unzip the wheel to a temporary directory.
let temp_dir = tempfile::tempdir_in(self.cache.root())?;

View file

@ -98,13 +98,6 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
.await?
}
SourceDist::Registry(registry_source_dist) => {
let url = registry_source_dist
.base
.join_relative(&registry_source_dist.file.url)
.map_err(|err| {
SourceDistError::UrlParse(registry_source_dist.file.url.clone(), err)
})?;
// For registry source distributions, shard by package, then by SHA.
// Ex) `pypi/requests/a673187abc19fe6c`
let cache_shard = self.build_context.cache().shard(
@ -117,7 +110,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
self.url(
source_dist,
&registry_source_dist.file.filename,
&url,
&registry_source_dist.file.url,
&cache_shard,
None,
)
@ -161,13 +154,6 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
.await?
}
SourceDist::Registry(registry_source_dist) => {
let url = registry_source_dist
.base
.join_relative(&registry_source_dist.file.url)
.map_err(|err| {
SourceDistError::UrlParse(registry_source_dist.file.url.clone(), err)
})?;
// For registry source distributions, shard by package, then by SHA.
// Ex) `pypi/requests/a673187abc19fe6c`
let cache_shard = self.build_context.cache().shard(
@ -180,7 +166,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
self.url_metadata(
source_dist,
&registry_source_dist.file.filename,
&url,
&registry_source_dist.file.url,
&cache_shard,
None,
)