Enforce extension validity at parse time (#5888)

## Summary

This PR adds a `DistExtension` field to some of our distribution types, requiring us to validate that the file type is known and supported at parse time (rather than when attempting to unzip). It also removes a bunch of scattered extension parsing from the code, in favor of doing it once upfront.

Closes https://github.com/astral-sh/uv/issues/5858.
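
For context, the diff below threads an `ext: SourceDistExtension` value through the source-distribution code paths, so extraction no longer re-derives the archive type from a filename. The following is a minimal sketch of the parse-time validation described above, assuming a hypothetical `from_filename` helper, error type, and variant set for illustration; the real type lives in `distribution_filename` and may differ:

```rust
/// Illustrative sketch only; not the actual `distribution_filename` API.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SourceDistExtension {
    Zip,
    TarGz,
    TarBz2,
}

/// Raised when a filename has an unknown or unsupported archive extension.
#[derive(Debug)]
pub struct UnsupportedExtension(String);

impl std::fmt::Display for UnsupportedExtension {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "unsupported archive extension: {}", self.0)
    }
}

impl std::error::Error for UnsupportedExtension {}

impl SourceDistExtension {
    /// Validate the extension once, at parse time, instead of when unzipping.
    pub fn from_filename(filename: &str) -> Result<Self, UnsupportedExtension> {
        if filename.ends_with(".zip") {
            Ok(Self::Zip)
        } else if filename.ends_with(".tar.gz") || filename.ends_with(".tgz") {
            Ok(Self::TarGz)
        } else if filename.ends_with(".tar.bz2") {
            Ok(Self::TarBz2)
        } else {
            Err(UnsupportedExtension(filename.to_string()))
        }
    }
}

fn main() {
    // An unsupported filename now fails when the distribution is parsed,
    // rather than surfacing later as an extraction error.
    assert!(SourceDistExtension::from_filename("foo-1.0.tar.gz").is_ok());
    assert!(SourceDistExtension::from_filename("foo-1.0.exe").is_err());
}
```

Validating once, when the distribution is constructed, means the caching, download, and unzip stages can accept the already-validated value instead of each re-parsing the filename or URL.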
Author: Charlie Marsh, 2024-08-08 21:39:47 -04:00 (committed by GitHub)
Parent: ba7c09edd0
Commit: 21408c1f35
36 changed files with 803 additions and 480 deletions


@@ -13,14 +13,14 @@ use tracing::{debug, info_span, instrument, Instrument};
use url::Url;
use zip::ZipArchive;
use distribution_filename::WheelFilename;
use distribution_filename::{SourceDistExtension, WheelFilename};
use distribution_types::{
BuildableSource, DirectorySourceUrl, FileLocation, GitSourceUrl, HashPolicy, Hashed,
PathSourceUrl, RemoteSource, SourceDist, SourceUrl,
};
use install_wheel_rs::metadata::read_archive_metadata;
use platform_tags::Tags;
use pypi_types::{HashDigest, Metadata23, ParsedArchiveUrl};
use pypi_types::{HashDigest, Metadata23};
use uv_cache::{
ArchiveTimestamp, CacheBucket, CacheEntry, CacheShard, CachedByTimestamp, Timestamp, WheelCache,
};
@@ -111,6 +111,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
&PathSourceUrl {
url: &url,
path: Cow::Borrowed(path),
ext: dist.ext,
},
&cache_shard,
tags,
@@ -132,6 +133,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
&PathSourceUrl {
url: &url,
path: Cow::Owned(path),
ext: dist.ext,
},
&cache_shard,
tags,
@@ -147,6 +149,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
&url,
&cache_shard,
None,
dist.ext,
tags,
hashes,
client,
@@ -156,21 +159,20 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
}
BuildableSource::Dist(SourceDist::DirectUrl(dist)) => {
let filename = dist.filename().expect("Distribution must have a filename");
let ParsedArchiveUrl { url, subdirectory } =
ParsedArchiveUrl::from(dist.url.to_url());
// For direct URLs, cache directly under the hash of the URL itself.
let cache_shard = self.build_context.cache().shard(
CacheBucket::SourceDistributions,
WheelCache::Url(&url).root(),
WheelCache::Url(&dist.url).root(),
);
self.url(
source,
&filename,
&url,
&dist.url,
&cache_shard,
subdirectory.as_deref(),
dist.subdirectory.as_deref(),
dist.ext,
tags,
hashes,
client,
@@ -208,21 +210,20 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
.url
.filename()
.expect("Distribution must have a filename");
let ParsedArchiveUrl { url, subdirectory } =
ParsedArchiveUrl::from(resource.url.clone());
// For direct URLs, cache directly under the hash of the URL itself.
let cache_shard = self.build_context.cache().shard(
CacheBucket::SourceDistributions,
WheelCache::Url(&url).root(),
WheelCache::Url(resource.url).root(),
);
self.url(
source,
&filename,
&url,
resource.url,
&cache_shard,
subdirectory.as_deref(),
resource.subdirectory,
resource.ext,
tags,
hashes,
client,
@@ -287,6 +288,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
&PathSourceUrl {
url: &url,
path: Cow::Borrowed(path),
ext: dist.ext,
},
&cache_shard,
hashes,
@@ -307,6 +309,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
&PathSourceUrl {
url: &url,
path: Cow::Owned(path),
ext: dist.ext,
},
&cache_shard,
hashes,
@@ -321,6 +324,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
&url,
&cache_shard,
None,
dist.ext,
hashes,
client,
)
@@ -329,21 +333,20 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
}
BuildableSource::Dist(SourceDist::DirectUrl(dist)) => {
let filename = dist.filename().expect("Distribution must have a filename");
let ParsedArchiveUrl { url, subdirectory } =
ParsedArchiveUrl::from(dist.url.to_url());
// For direct URLs, cache directly under the hash of the URL itself.
let cache_shard = self.build_context.cache().shard(
CacheBucket::SourceDistributions,
WheelCache::Url(&url).root(),
WheelCache::Url(&dist.url).root(),
);
self.url_metadata(
source,
&filename,
&url,
&dist.url,
&cache_shard,
subdirectory.as_deref(),
dist.subdirectory.as_deref(),
dist.ext,
hashes,
client,
)
@@ -374,21 +377,20 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
.url
.filename()
.expect("Distribution must have a filename");
let ParsedArchiveUrl { url, subdirectory } =
ParsedArchiveUrl::from(resource.url.clone());
// For direct URLs, cache directly under the hash of the URL itself.
let cache_shard = self.build_context.cache().shard(
CacheBucket::SourceDistributions,
WheelCache::Url(&url).root(),
WheelCache::Url(resource.url).root(),
);
self.url_metadata(
source,
&filename,
&url,
resource.url,
&cache_shard,
subdirectory.as_deref(),
resource.subdirectory,
resource.ext,
hashes,
client,
)
@@ -441,6 +443,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
url: &'data Url,
cache_shard: &CacheShard,
subdirectory: Option<&'data Path>,
ext: SourceDistExtension,
tags: &Tags,
hashes: HashPolicy<'_>,
client: &ManagedClient<'_>,
@@ -449,7 +452,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
// Fetch the revision for the source distribution.
let revision = self
.url_revision(source, filename, url, cache_shard, hashes, client)
.url_revision(source, filename, ext, url, cache_shard, hashes, client)
.await?;
// Before running the build, check that the hashes match.
@@ -512,6 +515,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
url: &'data Url,
cache_shard: &CacheShard,
subdirectory: Option<&'data Path>,
ext: SourceDistExtension,
hashes: HashPolicy<'_>,
client: &ManagedClient<'_>,
) -> Result<ArchiveMetadata, Error> {
@@ -519,7 +523,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
// Fetch the revision for the source distribution.
let revision = self
.url_revision(source, filename, url, cache_shard, hashes, client)
.url_revision(source, filename, ext, url, cache_shard, hashes, client)
.await?;
// Before running the build, check that the hashes match.
@@ -600,6 +604,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
&self,
source: &BuildableSource<'_>,
filename: &str,
ext: SourceDistExtension,
url: &Url,
cache_shard: &CacheShard,
hashes: HashPolicy<'_>,
@@ -626,7 +631,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
debug!("Downloading source distribution: {source}");
let entry = cache_shard.shard(revision.id()).entry(filename);
let hashes = self
.download_archive(response, source, filename, entry.path(), hashes)
.download_archive(response, source, filename, ext, entry.path(), hashes)
.await?;
Ok(revision.with_hashes(hashes))
@@ -859,7 +864,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
debug!("Unpacking source distribution: {source}");
let entry = cache_shard.shard(revision.id()).entry("source");
let hashes = self
.persist_archive(&resource.path, entry.path(), hashes)
.persist_archive(&resource.path, resource.ext, entry.path(), hashes)
.await?;
let revision = revision.with_hashes(hashes);
@@ -1306,6 +1311,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
response: Response,
source: &BuildableSource<'_>,
filename: &str,
ext: SourceDistExtension,
target: &Path,
hashes: HashPolicy<'_>,
) -> Result<Vec<HashDigest>, Error> {
@@ -1327,7 +1333,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
// Download and unzip the source distribution into a temporary directory.
let span = info_span!("download_source_dist", filename = filename, source_dist = %source);
uv_extract::stream::archive(&mut hasher, filename, temp_dir.path()).await?;
uv_extract::stream::archive(&mut hasher, ext, temp_dir.path()).await?;
drop(span);
// If necessary, exhaust the reader to compute the hash.
@@ -1359,6 +1365,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
async fn persist_archive(
&self,
path: &Path,
ext: SourceDistExtension,
target: &Path,
hashes: HashPolicy<'_>,
) -> Result<Vec<HashDigest>, Error> {
@@ -1380,7 +1387,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
let mut hasher = uv_extract::hash::HashReader::new(reader, &mut hashers);
// Unzip the archive into a temporary directory.
uv_extract::stream::archive(&mut hasher, path, &temp_dir.path()).await?;
uv_extract::stream::archive(&mut hasher, ext, &temp_dir.path()).await?;
// If necessary, exhaust the reader to compute the hash.
if !hashes.is_none() {