Prune unused source distributions from the cache (#7112)

## Summary

This has bothered me for a while and should be fairly impactful for
users. The implementation is a little unusual: the distribution-building
crate depends on the cache crate, but pruning source distributions needs
access to internals of the distribution-building crate. As a result, the
prune operation can't live in the cache crate itself, and is instead
exposed from the distribution-building crate.

Closes https://github.com/astral-sh/uv/issues/7096.
This commit is contained in:
Charlie Marsh 2024-09-05 21:40:51 -04:00 committed by GitHub
parent 1422e18674
commit 93fe3e83be
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 213 additions and 10 deletions

1
Cargo.lock generated
View file

@ -4812,6 +4812,7 @@ dependencies = [
"uv-types",
"uv-warnings",
"uv-workspace",
"walkdir",
"zip",
]

View file

@ -18,7 +18,7 @@ use uv_normalize::PackageName;
pub use crate::by_timestamp::CachedByTimestamp;
#[cfg(feature = "clap")]
pub use crate::cli::CacheArgs;
use crate::removal::{rm_rf, Removal};
pub use crate::removal::{rm_rf, Removal};
pub use crate::timestamp::Timestamp;
pub use crate::wheel::WheelCache;
use crate::wheel::WheelCacheKind;
@ -458,9 +458,7 @@ impl Cache {
}
}
// Third, remove any unused archives (by searching for archives that are not symlinked).
// TODO(charlie): Remove any unused source distributions. This requires introspecting the
// cache contents, e.g., reading and deserializing the manifests.
// Fourth, remove any unused archives (by searching for archives that are not symlinked).
let mut references = FxHashSet::default();
for bucket in CacheBucket::iter() {

View file

@ -7,7 +7,7 @@ use std::path::Path;
/// Remove a file or directory and all its contents, returning a [`Removal`] with
/// the number of files and directories removed, along with a total byte count.
pub(crate) fn rm_rf(path: impl AsRef<Path>) -> io::Result<Removal> {
pub fn rm_rf(path: impl AsRef<Path>) -> io::Result<Removal> {
let mut removal = Removal::default();
removal.rm_rf(path.as_ref())?;
Ok(removal)

View file

@ -46,6 +46,7 @@ tokio = { workspace = true }
tokio-util = { workspace = true, features = ["compat"] }
tracing = { workspace = true }
url = { workspace = true }
walkdir = { workspace = true }
zip = { workspace = true }
[dev-dependencies]

View file

@ -44,6 +44,8 @@ pub enum Error {
CacheDecode(#[from] rmp_serde::decode::Error),
#[error("Failed to serialize cache entry")]
CacheEncode(#[from] rmp_serde::encode::Error),
#[error("Failed to walk the distribution cache")]
CacheWalk(#[source] walkdir::Error),
// Build error
#[error(transparent)]

View file

@ -4,6 +4,7 @@ pub use error::Error;
pub use index::{BuiltWheelIndex, RegistryWheelIndex};
pub use metadata::{ArchiveMetadata, LoweredRequirement, Metadata, RequiresDist};
pub use reporter::Reporter;
pub use source::prune;
mod archive;
mod distribution_database;

View file

@ -22,7 +22,8 @@ use install_wheel_rs::metadata::read_archive_metadata;
use platform_tags::Tags;
use pypi_types::{HashDigest, Metadata12, Metadata23, RequiresTxt};
use uv_cache::{
ArchiveTimestamp, CacheBucket, CacheEntry, CacheShard, CachedByTimestamp, Timestamp, WheelCache,
ArchiveTimestamp, Cache, CacheBucket, CacheEntry, CacheShard, CachedByTimestamp, Removal,
Timestamp, WheelCache,
};
use uv_client::{
CacheControl, CachedClientError, Connectivity, DataWithCachePolicy, RegistryClient,
@ -1610,6 +1611,78 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
}
}
/// Prune any unused source distributions from the cache.
pub fn prune(cache: &Cache) -> Result<Removal, Error> {
let mut removal = Removal::default();
let bucket = cache.bucket(CacheBucket::SourceDistributions);
if bucket.is_dir() {
for entry in walkdir::WalkDir::new(bucket) {
let entry = entry.map_err(Error::CacheWalk)?;
// If we find a `revision.http` file, read the pointer, and remove any extraneous
// directories.
if entry.file_name() == "revision.http" {
let pointer = HttpRevisionPointer::read_from(entry.path())?;
if let Some(pointer) = pointer {
// Remove all sibling directories that are not referenced by the pointer.
for sibling in entry
.path()
.parent()
.unwrap()
.read_dir()
.map_err(Error::CacheRead)?
{
let sibling = sibling.map_err(Error::CacheRead)?;
if sibling.file_type().map_err(Error::CacheRead)?.is_dir() {
let sibling_name = sibling.file_name();
if sibling_name != pointer.revision.id().as_str() {
debug!(
"Removing dangling source revision: {}",
sibling.path().display()
);
removal +=
uv_cache::rm_rf(sibling.path()).map_err(Error::CacheWrite)?;
}
}
}
}
}
// If we find a `revision.rev` file, read the pointer, and remove any extraneous
// directories.
if entry.file_name() == "revision.rev" {
let pointer = LocalRevisionPointer::read_from(entry.path())?;
if let Some(pointer) = pointer {
// Remove all sibling directories that are not referenced by the pointer.
for sibling in entry
.path()
.parent()
.unwrap()
.read_dir()
.map_err(Error::CacheRead)?
{
let sibling = sibling.map_err(Error::CacheRead)?;
if sibling.file_type().map_err(Error::CacheRead)?.is_dir() {
let sibling_name = sibling.file_name();
if sibling_name != pointer.revision.id().as_str() {
debug!(
"Removing dangling source revision: {}",
sibling.path().display()
);
removal +=
uv_cache::rm_rf(sibling.path()).map_err(Error::CacheWrite)?;
}
}
}
}
}
}
}
Ok(removal)
}
/// Validate that the source distribution matches the built metadata.
fn validate(source: &BuildableSource<'_>, metadata: &Metadata23) -> Result<(), Error> {
if let Some(name) = source.name() {

View file

@ -63,6 +63,16 @@ impl RevisionId {
/// Create a [`RevisionId`] wrapping a freshly generated `nanoid` identifier.
fn new() -> Self {
    let id = nanoid::nanoid!();
    Self(id)
}
/// Return the underlying identifier as a string slice.
pub(crate) fn as_str(&self) -> &str {
    &self.0
}
}
/// Allow a [`RevisionId`] to be borrowed as a `&str`.
impl AsRef<str> for RevisionId {
    fn as_ref(&self) -> &str {
        self.0.as_str()
    }
}
impl AsRef<Path> for RevisionId {

View file

@ -3,7 +3,7 @@ use std::fmt::Write;
use anyhow::{Context, Result};
use owo_colors::OwoColorize;
use uv_cache::Cache;
use uv_cache::{Cache, Removal};
use uv_fs::Simplified;
use crate::commands::{human_readable_bytes, ExitStatus};
@ -26,7 +26,14 @@ pub(crate) fn cache_prune(ci: bool, cache: &Cache, printer: Printer) -> Result<E
cache.root().user_display().cyan()
)?;
let summary = cache
let mut summary = Removal::default();
// Prune the source distribution cache, which is tightly coupled to the builder crate.
summary += uv_distribution::prune(cache)
.with_context(|| format!("Failed to prune cache at: {}", cache.root().user_display()))?;
// Prune the remaining cache buckets.
summary += cache
.prune(ci)
.with_context(|| format!("Failed to prune cache at: {}", cache.root().user_display()))?;

View file

@ -106,7 +106,7 @@ fn prune_cached_env() {
.chain([
// The cache entry does not have a stable key, so we filter it out
(
r"\[CACHE_DIR\](\\|\/)(.+)(\\|\/).*",
r"\[CACHE_DIR\](\\|\/)(.*?)(\\|\/).*",
"[CACHE_DIR]/$2/[ENTRY]",
),
])
@ -151,7 +151,7 @@ fn prune_stale_symlink() -> Result<()> {
.chain([
// The cache entry does not have a stable key, so we filter it out
(
r"\[CACHE_DIR\](\\|\/)(.+)(\\|\/).*",
r"\[CACHE_DIR\](\\|\/)(.*?)(\\|\/).*",
"[CACHE_DIR]/$2/[ENTRY]",
),
])
@ -252,3 +252,113 @@ fn prune_unzipped() -> Result<()> {
Ok(())
}
/// `cache prune` should remove any stale source distribution revisions.
///
/// The test builds a local project twice with `--reinstall`, runs `cache prune --verbose`, and
/// snapshots the debug output to confirm that one dangling source revision (and one dangling
/// archive entry) is removed. It then uninstalls and reinstalls the project; the final snapshot
/// contains no `Prepared` line, i.e., the install is expected to be served from the cache.
#[test]
fn prune_stale_revision() -> Result<()> {
let context = TestContext::new("3.12");
// Create a minimal setuptools-based project in the temporary directory.
let pyproject_toml = context.temp_dir.child("pyproject.toml");
pyproject_toml.write_str(
r#"
[project]
name = "project"
version = "0.1.0"
requires-python = ">=3.12"
dependencies = []
[build-system]
requires = ["setuptools>=42"]
build-backend = "setuptools.build_meta"
"#,
)?;
context.temp_dir.child("src").child("__init__.py").touch()?;
context.temp_dir.child("README").touch()?;
// Install the same package twice, with `--reinstall`.
uv_snapshot!(context.filters(), context
.pip_install()
.arg(".")
.arg("--reinstall"), @r###"
success: true
exit_code: 0
----- stdout -----
----- stderr -----
Resolved 1 package in [TIME]
Prepared 1 package in [TIME]
Installed 1 package in [TIME]
+ project==0.1.0 (from file://[TEMP_DIR]/)
"###);
// The second `--reinstall` prepares the package again (note the `Prepared` line below), which
// is what leaves an older revision behind for `cache prune` to clean up.
uv_snapshot!(context.filters(), context
.pip_install()
.arg(".")
.arg("--reinstall"), @r###"
success: true
exit_code: 0
----- stdout -----
----- stderr -----
Resolved 1 package in [TIME]
Prepared 1 package in [TIME]
Uninstalled 1 package in [TIME]
Installed 1 package in [TIME]
~ project==0.1.0 (from file://[TEMP_DIR]/)
"###);
// Extend the default filters so that everything below the first path component under
// `[CACHE_DIR]` is replaced with `[ENTRY]`; the non-greedy `(.*?)` captures only the bucket name.
let filters: Vec<_> = context
.filters()
.into_iter()
.chain([
// The cache entry does not have a stable key, so we filter it out
(
r"\[CACHE_DIR\](\\|\/)(.*?)(\\|\/).*",
"[CACHE_DIR]/$2/[ENTRY]",
),
])
.collect();
// Pruning should remove the unused revision.
uv_snapshot!(&filters, context.prune().arg("--verbose"), @r###"
success: true
exit_code: 0
----- stdout -----
----- stderr -----
DEBUG uv [VERSION] ([COMMIT] DATE)
Pruning cache at: [CACHE_DIR]/
DEBUG Removing dangling source revision: [CACHE_DIR]/built-wheels-v3/[ENTRY]
DEBUG Removing dangling cache entry: [CACHE_DIR]/archive-v0/[ENTRY]
Removed 8 files ([SIZE])
"###);
// Uninstall and reinstall the package. We should use the cached version.
uv_snapshot!(context.filters(), context
.pip_uninstall()
.arg("."), @r###"
success: true
exit_code: 0
----- stdout -----
----- stderr -----
Uninstalled 1 package in [TIME]
- project==0.1.0 (from file://[TEMP_DIR]/)
"###);
// No `Prepared` line is expected here: the live revision must have survived the prune.
uv_snapshot!(context.filters(), context
.pip_install()
.arg("."), @r###"
success: true
exit_code: 0
----- stdout -----
----- stderr -----
Resolved 1 package in [TIME]
Installed 1 package in [TIME]
+ project==0.1.0 (from file://[TEMP_DIR]/)
"###);
Ok(())
}