Heal cache entries with missing source distributions (#7559)

## Summary

`uv cache prune --ci` will remove the source distribution directory. If
we then need to build a _different_ wheel (e.g., you're building a
package that has Python minor version-specific wheels), we fail, because
we expect the source to be there.

Now, if the source is missing, we re-download it. It would be slightly
easier to just _ignore_ that revision, but that would mean we'd also
lose the already-built wheels -- so if you ran against many Python
versions, we'd continuously lose the cached data.
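
As a sketch, this is the guard that each build/metadata path now runs before touching the source (condensed from the diff below; not a standalone API):

```rust
// Condensed from the diff below. If the unpacked source is still on
// disk, reuse the revision (and its already-built wheels) as-is;
// otherwise, re-download or re-extract it, verifying the bytes against
// the hashes recorded in the revision.
let revision = if source_dist_entry.path().is_dir() {
    revision
} else {
    self.heal_url_revision(
        source, filename, ext, url, &source_dist_entry, revision, hashes, client,
    )
    .await?
};
```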

Closes https://github.com/astral-sh/uv/issues/7543.

## Test Plan

We can add tests, but they _need_ to build non-pure Python wheels, which
tends to be expensive...

For reference:

```console
$ cargo run venv --python 3.12
$ cargo run pip install mercurial==6.8.1 --verbose
$ cargo run cache prune --ci
$ cargo run venv --python 3.11
$ cargo run pip install mercurial==6.8.1 --verbose
```

I also did this with a local `.tar.gz` that I downloaded from PyPI.
Charlie Marsh, 2024-09-19 16:31:55 -04:00 (committed by GitHub)
parent 5de6d2338d · commit f3463b3d08
5 changed files with 138 additions and 10 deletions

Cargo.lock (generated):

```diff
@@ -4795,6 +4795,7 @@ dependencies = [
  "indoc",
  "insta",
  "nanoid",
+ "owo-colors",
  "pep440_rs",
  "pep508_rs",
  "platform-tags",
```

```diff
@@ -45,6 +45,11 @@ impl CacheEntry {
         Self(path.into())
     }
 
+    /// Return the cache entry's parent directory.
+    pub fn shard(&self) -> CacheShard {
+        CacheShard(self.dir().to_path_buf())
+    }
+
     /// Convert the [`CacheEntry`] into a [`PathBuf`].
     #[inline]
     pub fn into_path_buf(self) -> PathBuf {
```
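
For context: `shard()` returns the entry's parent directory as a `CacheShard`, which lets the heal path derive sibling cache entries from the source distribution's own entry. For example, the source-distribution diff below uses it to locate the stored HTTP revision next to the archive:

```rust
// As used in `heal_url_revision` below; `HTTP_REVISION` is an internal
// constant naming the revision file within the shard.
let cache_entry = entry.shard().entry(HTTP_REVISION);
```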

```diff
@@ -37,6 +37,7 @@ anyhow = { workspace = true }
 fs-err = { workspace = true }
 futures = { workspace = true }
 nanoid = { workspace = true }
+owo-colors = { workspace = true }
 reqwest = { workspace = true }
 reqwest-middleware = { workspace = true }
 rmp-serde = { workspace = true }
```

```diff
@@ -1,5 +1,6 @@
 use std::path::PathBuf;
 
+use owo_colors::OwoColorize;
 use tokio::task::JoinError;
 use url::Url;
 use zip::result::ZipError;
@@ -93,6 +94,8 @@ pub enum Error {
     MetadataLowering(#[from] MetadataError),
     #[error("Distribution not found at: {0}")]
     NotFound(Url),
+    #[error("Attempted to re-extract the source distribution for `{0}`, but the hashes didn't match. Run `{}` to clear the cache.", "uv cache clean".green())]
+    CacheHeal(String),
 
     /// A generic request middleware error happened while making a request.
     /// Refer to the error message for more details.
```

```diff
@@ -23,7 +23,7 @@ use platform_tags::Tags;
 use pypi_types::{HashDigest, Metadata12, Metadata23, RequiresTxt};
 use reqwest::Response;
 use tokio_util::compat::FuturesAsyncReadCompatExt;
-use tracing::{debug, info_span, instrument, Instrument};
+use tracing::{debug, info_span, instrument, warn, Instrument};
 use url::Url;
 use uv_cache::{Cache, CacheBucket, CacheEntry, CacheShard, Removal, WheelCache};
 use uv_cache_info::CacheInfo;
@@ -425,6 +425,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
         // Scope all operations to the revision. Within the revision, there's no need to check for
         // freshness, since entries have to be fresher than the revision itself.
         let cache_shard = cache_shard.shard(revision.id());
+        let source_dist_entry = cache_shard.entry(filename);
 
         // If there are build settings, we need to scope to a cache shard.
         let config_settings = self.build_context.config_settings();
```
```diff
@@ -439,13 +440,29 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
             return Ok(built_wheel.with_hashes(revision.into_hashes()));
         }
 
+        // Otherwise, we need to build a wheel. Before building, ensure that the source is present.
+        let revision = if source_dist_entry.path().is_dir() {
+            revision
+        } else {
+            self.heal_url_revision(
+                source,
+                filename,
+                ext,
+                url,
+                &source_dist_entry,
+                revision,
+                hashes,
+                client,
+            )
+            .await?
+        };
+
         let task = self
             .reporter
             .as_ref()
             .map(|reporter| reporter.on_build_start(source));
 
         // Build the source distribution.
-        let source_dist_entry = cache_shard.entry(filename);
         let (disk_filename, wheel_filename, metadata) = self
             .build_distribution(source, source_dist_entry.path(), subdirectory, &cache_shard)
             .await?;
```
```diff
@@ -527,6 +544,23 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
             });
         }
 
+        // Otherwise, we need a wheel.
+        let revision = if source_dist_entry.path().is_dir() {
+            revision
+        } else {
+            self.heal_url_revision(
+                source,
+                filename,
+                ext,
+                url,
+                &source_dist_entry,
+                revision,
+                hashes,
+                client,
+            )
+            .await?
+        };
+
         // If there are build settings, we need to scope to a cache shard.
         let config_settings = self.build_context.config_settings();
         let cache_shard = if config_settings.is_empty() {
```
```diff
@@ -686,6 +720,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
         // Scope all operations to the revision. Within the revision, there's no need to check for
         // freshness, since entries have to be fresher than the revision itself.
         let cache_shard = cache_shard.shard(revision.id());
+        let source_entry = cache_shard.entry("source");
 
         // If there are build settings, we need to scope to a cache shard.
         let config_settings = self.build_context.config_settings();
@@ -700,9 +735,14 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
             return Ok(built_wheel);
         }
 
-        let source_entry = cache_shard.entry("source");
+        // Otherwise, we need to build a wheel, which requires a source distribution.
+        let revision = if source_entry.path().is_dir() {
+            revision
+        } else {
+            self.heal_archive_revision(source, resource, &source_entry, revision, hashes)
+                .await?
+        };
 
-        // Otherwise, we need to build a wheel.
         let task = self
             .reporter
             .as_ref()
```
```diff
@@ -785,6 +825,14 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
             });
         }
 
+        // Otherwise, we need a source distribution.
+        let revision = if source_entry.path().is_dir() {
+            revision
+        } else {
+            self.heal_archive_revision(source, resource, &source_entry, revision, hashes)
+                .await?
+        };
+
         // If the backend supports `prepare_metadata_for_build_wheel`, use it.
         if let Some(metadata) = self
             .build_metadata(source, source_entry.path(), None)
```
```diff
@@ -1346,6 +1394,66 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
         ))
     }
 
+    /// Heal a [`Revision`] for a local archive.
+    async fn heal_archive_revision(
+        &self,
+        source: &BuildableSource<'_>,
+        resource: &PathSourceUrl<'_>,
+        entry: &CacheEntry,
+        revision: Revision,
+        hashes: HashPolicy<'_>,
+    ) -> Result<Revision, Error> {
+        warn!("Re-extracting missing source distribution: {source}");
+        let hashes = self
+            .persist_archive(&resource.path, resource.ext, entry.path(), hashes)
+            .await?;
+        if hashes != revision.hashes() {
+            return Err(Error::CacheHeal(source.to_string()));
+        }
+        Ok(revision.with_hashes(hashes))
+    }
+
+    /// Heal a [`Revision`] for a remote archive.
+    async fn heal_url_revision(
+        &self,
+        source: &BuildableSource<'_>,
+        filename: &str,
+        ext: SourceDistExtension,
+        url: &Url,
+        entry: &CacheEntry,
+        revision: Revision,
+        hashes: HashPolicy<'_>,
+        client: &ManagedClient<'_>,
+    ) -> Result<Revision, Error> {
+        warn!("Re-downloading missing source distribution: {source}");
+        let cache_entry = entry.shard().entry(HTTP_REVISION);
+        let download = |response| {
+            async {
+                let hashes = self
+                    .download_archive(response, source, filename, ext, entry.path(), hashes)
+                    .await?;
+                if hashes != revision.hashes() {
+                    return Err(Error::CacheHeal(source.to_string()));
+                }
+                Ok(revision.with_hashes(hashes))
+            }
+            .boxed_local()
+            .instrument(info_span!("download", source_dist = %source))
+        };
+        client
+            .managed(|client| async move {
+                client
+                    .cached_client()
+                    .skip_cache(Self::request(url.clone(), client)?, &cache_entry, download)
+                    .await
+                    .map_err(|err| match err {
+                        CachedClientError::Callback(err) => err,
+                        CachedClientError::Client(err) => Error::Client(err),
+                    })
+            })
+            .await
+    }
+
     /// Download and unzip a source distribution into the cache from an HTTP response.
     async fn download_archive(
         &self,
```
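
Both heal paths enforce the same invariant, using the names from the hunk above: the re-fetched archive must hash to exactly what the revision recorded, because the revision's already-built wheels were produced from those original bytes. Otherwise the new `CacheHeal` error is surfaced, pointing the user at `uv cache clean`:

```rust
// From the hunk above: a mismatch means the cached wheels can no longer
// be trusted, and the revision can't be healed in place.
if hashes != revision.hashes() {
    return Err(Error::CacheHeal(source.to_string()));
}
Ok(revision.with_hashes(hashes))
```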
```diff
@@ -1395,9 +1503,14 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
         fs_err::tokio::create_dir_all(target.parent().expect("Cache entry to have parent"))
             .await
             .map_err(Error::CacheWrite)?;
-        rename_with_retry(extracted, target)
-            .await
-            .map_err(Error::CacheWrite)?;
+        if let Err(err) = rename_with_retry(extracted, target).await {
+            // If the directory already exists, accept it.
+            if target.is_dir() {
+                warn!("Directory already exists: {}", target.display());
+            } else {
+                return Err(Error::CacheWrite(err));
+            }
+        }
 
         Ok(hashes)
     }
```
```diff
@@ -1448,9 +1561,14 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
         fs_err::tokio::create_dir_all(target.parent().expect("Cache entry to have parent"))
             .await
             .map_err(Error::CacheWrite)?;
-        rename_with_retry(extracted, &target)
-            .await
-            .map_err(Error::CacheWrite)?;
+        if let Err(err) = rename_with_retry(extracted, target).await {
+            // If the directory already exists, accept it.
+            if target.is_dir() {
+                warn!("Directory already exists: {}", target.display());
+            } else {
+                return Err(Error::CacheWrite(err));
+            }
+        }
 
         Ok(hashes)
     }
```
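
The `rename_with_retry` changes make the extract-then-rename step idempotent: if another process (or an earlier heal) already moved a directory into place, the existing directory is accepted rather than treated as a failure. A standalone sketch of the same pattern using only `std` (uv's `rename_with_retry` additionally retries transient failures, which this omits):

```rust
use std::io;
use std::path::Path;

/// Move `extracted` into place at `target`, accepting a pre-existing
/// `target` (e.g., written by a concurrent build) as success.
fn persist_dir(extracted: &Path, target: &Path) -> io::Result<()> {
    if let Some(parent) = target.parent() {
        std::fs::create_dir_all(parent)?;
    }
    match std::fs::rename(extracted, target) {
        Ok(()) => Ok(()),
        // If the target directory already exists, another writer won the
        // race; accept it rather than failing the build.
        Err(err) if target.is_dir() => {
            eprintln!("directory already exists ({err}): {}", target.display());
            Ok(())
        }
        Err(err) => Err(err),
    }
}
```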