Add a garbage collection mechanism to the CLI (#1217)

## Summary

Detects unused cache entries, which can come in a few forms:

1. Directories that are outdated under our versioning scheme.
2. Old source distribution builds (i.e., builds superseded by a more recent version).
3. Old wheels (stored in `archive-v0`, but not symlinked to from
anywhere in the cache).

Closes https://github.com/astral-sh/puffin/issues/1059.
This commit is contained in:
Charlie Marsh 2024-03-21 14:07:48 -04:00 committed by GitHub
parent 7ee90dc71f
commit 0f96386032
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 348 additions and 28 deletions

2
Cargo.lock generated
View file

@ -4467,8 +4467,10 @@ dependencies = [
"distribution-types",
"fs-err",
"nanoid",
"rustc-hash",
"serde",
"tempfile",
"tracing",
"url",
"uv-fs",
"uv-normalize",

View file

@ -24,7 +24,9 @@ clap = { workspace = true, features = ["derive", "env"], optional = true }
directories = { workspace = true }
fs-err = { workspace = true, features = ["tokio"] }
nanoid = { workspace = true }
rustc-hash = { workspace = true }
serde = { workspace = true, features = ["derive"] }
tempfile = { workspace = true }
tracing = { workspace = true }
url = { workspace = true }
walkdir = { workspace = true }

View file

@ -6,10 +6,12 @@ use std::ops::Deref;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use distribution_types::InstalledDist;
use fs_err as fs;
use rustc_hash::FxHashSet;
use tempfile::{tempdir, TempDir};
use tracing::debug;
use distribution_types::InstalledDist;
use uv_fs::directories;
use uv_normalize::PackageName;
@ -283,17 +285,72 @@ impl Cache {
/// Returns the number of entries removed from the cache.
pub fn remove(&self, name: &PackageName) -> Result<Removal, io::Error> {
let mut summary = Removal::default();
for bucket in [
CacheBucket::Wheels,
CacheBucket::BuiltWheels,
CacheBucket::Git,
CacheBucket::Interpreter,
CacheBucket::Simple,
] {
for bucket in CacheBucket::iter() {
summary += bucket.remove(self, name)?;
}
Ok(summary)
}
/// Run the garbage collector on the cache, removing any dangling entries.
///
/// Pruning happens in two passes:
///
/// 1. Every top-level entry of the cache root is removed unless it is one of the marker
///    entries (`CACHEDIR.TAG`, `.gitignore`, `.git`) or a directory named after a current
///    [`CacheBucket`]. This typically clears out outdated buckets (e.g., `wheels-v0`, when
///    latest is `wheels-v1`).
/// 2. Every entry of the archive bucket that is not the target of a symlink found in any
///    bucket is removed.
///
/// Returns a [`Removal`] summarizing everything that was deleted.
///
/// # Errors
///
/// Returns an error if the cache root cannot be read, or if walking or removing an entry
/// fails. A missing archive bucket and symlinks whose target no longer exists are not
/// treated as errors.
pub fn prune(&self) -> Result<Removal, io::Error> {
    let mut summary = Removal::default();

    // First, remove any top-level directories that are unused. These typically represent
    // outdated cache buckets (e.g., `wheels-v0`, when latest is `wheels-v1`).
    for entry in fs::read_dir(&self.root)? {
        let entry = entry?;
        let metadata = entry.metadata()?;
        let file_name = entry.file_name();

        // Marker files and directories are always preserved.
        if file_name == "CACHEDIR.TAG" || file_name == ".gitignore" || file_name == ".git" {
            continue;
        }

        // Directories that back a current cache bucket are preserved; everything else
        // (stale directories and stray files alike) is removed.
        let is_current_bucket = metadata.is_dir()
            && CacheBucket::iter().any(|bucket| file_name == bucket.to_str());
        if is_current_bucket {
            continue;
        }

        let path = entry.path();
        debug!("Removing dangling cache entry: {}", path.display());
        summary += rm_rf(path)?;
    }

    // Second, remove any unused archives (by searching for archives that are not symlinked).
    // TODO(charlie): Remove any unused source distributions. This requires introspecting the
    // cache contents, e.g., reading and deserializing the manifests.
    let mut references = FxHashSet::default();
    for bucket in CacheBucket::iter() {
        let bucket = self.bucket(bucket);
        if bucket.is_dir() {
            for entry in walkdir::WalkDir::new(bucket) {
                let entry = entry?;
                if entry.file_type().is_symlink() {
                    match entry.path().canonicalize() {
                        Ok(target) => {
                            references.insert(target);
                        }
                        // A symlink whose target has already disappeared keeps nothing
                        // alive, so it must not abort the whole prune.
                        Err(err) if err.kind() == io::ErrorKind::NotFound => {}
                        Err(err) => return Err(err),
                    }
                }
            }
        }
    }

    // A cache that never stored an archive has no archive bucket; there is nothing left
    // to prune in that case.
    let archives = match fs::read_dir(self.bucket(CacheBucket::Archive)) {
        Ok(archives) => archives,
        Err(err) if err.kind() == io::ErrorKind::NotFound => return Ok(summary),
        Err(err) => return Err(err),
    };
    for entry in archives {
        let entry = entry?;
        let path = entry.path().canonicalize()?;
        if !references.contains(&path) {
            debug!("Removing dangling cache entry: {}", path.display());
            summary += rm_rf(path)?;
        }
    }

    Ok(summary)
}
}
/// The different kinds of data in the cache are stored in different buckets, which in our case
@ -633,6 +690,21 @@ impl CacheBucket {
}
Ok(summary)
}
/// Return an iterator over all cache buckets.
///
/// Buckets are yielded by value, in the fixed order `Wheels`, `BuiltWheels`, `FlatIndex`,
/// `Git`, `Interpreter`, `Simple`, `Archive`.
pub fn iter() -> impl Iterator<Item = CacheBucket> {
    const BUCKETS: &[CacheBucket] = &[
        CacheBucket::Wheels,
        CacheBucket::BuiltWheels,
        CacheBucket::FlatIndex,
        CacheBucket::Git,
        CacheBucket::Interpreter,
        CacheBucket::Simple,
        CacheBucket::Archive,
    ];
    BUCKETS.iter().copied()
}
}
impl Display for CacheBucket {

View file

@ -7,10 +7,10 @@ use uv_cache::Cache;
use uv_fs::Simplified;
use uv_normalize::PackageName;
use crate::commands::ExitStatus;
use crate::commands::{human_readable_bytes, ExitStatus};
use crate::printer::Printer;
/// Clear the cache.
/// Clear the cache, removing all entries or those linked to specific packages.
pub(crate) fn cache_clean(
packages: &[PackageName],
cache: &Cache,
@ -123,19 +123,3 @@ pub(crate) fn cache_clean(
Ok(ExitStatus::Success)
}
/// Formats a number of bytes into a human-readable size with a binary (IEC) prefix.
///
/// The quantity is scaled by the largest power of 1024 that does not exceed it, capped at
/// `EiB`. Zero bytes yields `(0.0, "B")`.
///
/// Returns a tuple of `(quantity, units)`.
// The float/integer casts are intentional: only an approximate, display-ready value is
// needed, and the unit index is clamped to the table length before it is used.
#[allow(
    clippy::cast_possible_truncation,
    clippy::cast_possible_wrap,
    clippy::cast_precision_loss,
    clippy::cast_sign_loss
)]
fn human_readable_bytes(bytes: u64) -> (f32, &'static str) {
    const UNITS: [&str; 7] = ["B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB"];

    let quantity = bytes as f32;
    // Each unit step is 2^10, so the unit index is `log2(bytes) / 10`, rounded down.
    let magnitude = (quantity.log2() / 10.0) as usize;
    let index = magnitude.min(UNITS.len() - 1);

    (quantity / 1024_f32.powi(index as i32), UNITS[index])
}

View file

@ -0,0 +1,66 @@
use std::fmt::Write;
use anyhow::{Context, Result};
use owo_colors::OwoColorize;
use uv_cache::Cache;
use uv_fs::Simplified;
use crate::commands::{human_readable_bytes, ExitStatus};
use crate::printer::Printer;
/// Prune all unreachable objects from the cache.
///
/// All output goes to the printer's stderr stream. If the cache root does not exist, a
/// notice is printed and the command succeeds without doing anything. Otherwise the cache
/// is pruned and a one-line summary is printed: the number of files removed (or, when no
/// files were removed, the number of directories), followed by the total size reclaimed
/// when that is non-zero.
///
/// # Errors
///
/// Returns an error if pruning the cache fails or if writing to the printer fails.
pub(crate) fn cache_prune(cache: &Cache, printer: Printer) -> Result<ExitStatus> {
    let root = cache.root();

    // Nothing to do if the cache was never created.
    if !root.exists() {
        writeln!(
            printer.stderr(),
            "No cache found at: {}",
            root.user_display().cyan()
        )?;
        return Ok(ExitStatus::Success);
    }

    writeln!(
        printer.stderr(),
        "Pruning cache at: {}",
        root.user_display().cyan()
    )?;

    let summary = cache
        .prune()
        .with_context(|| format!("Failed to prune cache at: {}", root.user_display()))?;

    // Describe the number of files and directories removed. The file count takes
    // precedence; directories are only reported when no files were removed.
    let removed = match (summary.num_files, summary.num_dirs) {
        (0, 0) => "No unused entries found".to_string(),
        (0, 1) => "Removed 1 directory".to_string(),
        (0, num_dirs_removed) => format!("Removed {num_dirs_removed} directories"),
        (1, _) => "Removed 1 file".to_string(),
        (num_files_removed, _) => format!("Removed {num_files_removed} files"),
    };
    write!(printer.stderr(), "{removed}")?;

    // If any, append the total byte count removed. Sizes below 1 KiB are printed as an
    // exact integer rather than a one-decimal float.
    if summary.total_bytes > 0 {
        let bytes = if summary.total_bytes >= 1024 {
            let (bytes, unit) = human_readable_bytes(summary.total_bytes);
            format!("{bytes:.1}{unit}")
        } else {
            format!("{}B", summary.total_bytes)
        };
        write!(printer.stderr(), " ({})", bytes.green())?;
    }

    // Terminate the summary line.
    writeln!(printer.stderr())?;

    Ok(ExitStatus::Success)
}

View file

@ -6,6 +6,7 @@ use owo_colors::OwoColorize;
pub(crate) use cache_clean::cache_clean;
pub(crate) use cache_dir::cache_dir;
pub(crate) use cache_prune::cache_prune;
use distribution_types::InstalledMetadata;
pub(crate) use pip_check::pip_check;
pub(crate) use pip_compile::{extra_name_with_clap_error, pip_compile};
@ -28,6 +29,7 @@ use crate::printer::Printer;
mod cache_clean;
mod cache_dir;
mod cache_prune;
mod pip_check;
mod pip_compile;
mod pip_freeze;
@ -155,3 +157,19 @@ pub(super) async fn compile_bytecode(
)?;
Ok(())
}
/// Formats a number of bytes into a human-readable size with a binary (IEC) prefix.
///
/// The value is divided by the largest power of 1024 that does not exceed it (at most
/// 1024^6, i.e. `EiB`). An input of zero yields `(0.0, "B")`.
///
/// Returns a tuple of `(quantity, units)`.
// The lossy casts below are deliberate: the result is only used for display, and the unit
// index is clamped to the last table entry before indexing.
#[allow(
    clippy::cast_possible_truncation,
    clippy::cast_possible_wrap,
    clippy::cast_precision_loss,
    clippy::cast_sign_loss
)]
pub(super) fn human_readable_bytes(bytes: u64) -> (f32, &'static str) {
    static UNITS: [&str; 7] = ["B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB"];

    let size = bytes as f32;
    let largest = UNITS.len() - 1;
    // One unit step per ten bits of magnitude.
    let exponent = usize::min((size.log2() / 10.0) as usize, largest);
    let scale = 1024_f32.powi(exponent as i32);

    (size / scale, UNITS[exponent])
}

View file

@ -137,7 +137,7 @@ enum Commands {
/// Manage the `uv` executable.
#[clap(name = "self")]
Self_(SelfNamespace),
/// Remove all items from the cache.
/// Clear the cache, removing all entries or those linked to specific packages.
#[clap(hide = true)]
Clean(CleanArgs),
/// Display uv's version
@ -170,8 +170,10 @@ struct CacheNamespace {
#[derive(Subcommand)]
enum CacheCommand {
/// Remove all items from the cache.
/// Clear the cache, removing all entries or those linked to specific packages.
Clean(CleanArgs),
/// Prune all unreachable objects from the cache.
Prune,
/// Show the cache directory.
Dir,
}
@ -1759,6 +1761,9 @@ async fn run() -> Result<ExitStatus> {
command: CacheCommand::Clean(args),
})
| Commands::Clean(args) => commands::cache_clean(&args.package, &cache, printer),
Commands::Cache(CacheNamespace {
command: CacheCommand::Prune,
}) => commands::cache_prune(&cache, printer),
Commands::Cache(CacheNamespace {
command: CacheCommand::Dir,
}) => {

View file

@ -0,0 +1,171 @@
#![cfg(all(feature = "python", feature = "pypi"))]
use std::process::Command;
use anyhow::Result;
use assert_cmd::prelude::*;
use assert_fs::prelude::*;
use common::uv_snapshot;
use crate::common::{get_bin, TestContext, INSTA_FILTERS};
mod common;
/// Create a `cache prune` command with options shared across scenarios.
///
/// The command uses the test context's cache directory, exposes the context's virtual
/// environment through `VIRTUAL_ENV`, and runs from the context's temporary directory.
fn prune_command(context: &TestContext) -> Command {
    let mut cmd = Command::new(get_bin());
    cmd.args(["cache", "prune", "--cache-dir"])
        .arg(context.cache_dir.path())
        .env("VIRTUAL_ENV", context.venv.as_os_str())
        .current_dir(&context.temp_dir);

    // TODO(konstin): Reduce stack usage in debug mode enough that the tests pass with the
    // default windows stack of 1MB
    if cfg!(all(windows, debug_assertions)) {
        let stack_size = (8 * 1024 * 1024).to_string();
        cmd.env("UV_STACK_SIZE", stack_size);
    }

    cmd
}
/// Create a `pip sync` command with options shared across scenarios.
///
/// The command points at the test context's cache directory, sets `VIRTUAL_ENV` to the
/// context's virtual environment, and uses the context's temporary directory as its
/// working directory.
fn sync_command(context: &TestContext) -> Command {
    let needs_larger_stack = cfg!(all(windows, debug_assertions));

    let mut sync = Command::new(get_bin());
    sync.args(["pip", "sync"])
        .arg("--cache-dir")
        .arg(context.cache_dir.path())
        .env("VIRTUAL_ENV", context.venv.as_os_str())
        .current_dir(&context.temp_dir);

    if needs_larger_stack {
        // TODO(konstin): Reduce stack usage in debug mode enough that the tests pass with the
        // default windows stack of 1MB
        sync.env("UV_STACK_SIZE", (8 * 1024 * 1024).to_string());
    }

    sync
}
/// `cache prune` should be a no-op if there's nothing out-of-date in the cache.
///
/// Installs `anyio` via `pip sync` so the cache is populated, then runs `cache prune
/// --verbose` and expects the "No unused entries found" summary with no debug lines about
/// removed entries.
#[test]
fn prune_no_op() -> Result<()> {
let context = TestContext::new("3.12");
let requirements_txt = context.temp_dir.child("requirements.txt");
requirements_txt.write_str("anyio")?;
// Install a requirement, to populate the cache.
sync_command(&context)
.arg("requirements.txt")
.assert()
.success();
// Normalize the machine-specific cache path in the output before applying the shared
// snapshot filters.
let filters = [(r"Pruning cache at: .*", "Pruning cache at: [CACHE_DIR]")]
.into_iter()
.chain(INSTA_FILTERS.to_vec())
.collect::<Vec<_>>();
uv_snapshot!(filters, prune_command(&context).arg("--verbose"), @r###"
success: true
exit_code: 0
----- stdout -----
----- stderr -----
Pruning cache at: [CACHE_DIR]
No unused entries found
"###);
Ok(())
}
/// `cache prune` should remove any stale top-level directories from the cache.
///
/// Populates the cache, creates an extra empty `simple-v4` directory in the cache root,
/// and expects `cache prune --verbose` to report removing exactly one directory.
#[test]
fn prune_stale_directory() -> Result<()> {
let context = TestContext::new("3.12");
let requirements_txt = context.temp_dir.child("requirements.txt");
requirements_txt.write_str("anyio")?;
// Install a requirement, to populate the cache.
sync_command(&context)
.arg("requirements.txt")
.assert()
.success();
// Add a stale directory to the cache.
// NOTE(review): presumably `simple-v4` is not the current name of the `Simple` bucket,
// which is what makes it stale — confirm against `CacheBucket::to_str`.
let simple = context.cache_dir.child("simple-v4");
simple.create_dir_all()?;
// The second filter rewrites the "Removing dangling cache entry: ..." debug line into
// "Pruning cache at: [CACHE_DIR]/simple-v4", which is why the snapshot below shows a
// `DEBUG Pruning cache at:` line. `[\\|/]` is a character class matching a backslash, a
// literal `|`, or a forward slash, so both Windows and Unix separators match.
let filters = [
(r"Pruning cache at: .*", "Pruning cache at: [CACHE_DIR]"),
(
r"Removing dangling cache entry: .*[\\|/]simple-v4",
"Pruning cache at: [CACHE_DIR]/simple-v4",
),
]
.into_iter()
.chain(INSTA_FILTERS.to_vec())
.collect::<Vec<_>>();
uv_snapshot!(filters, prune_command(&context).arg("--verbose"), @r###"
success: true
exit_code: 0
----- stdout -----
----- stderr -----
Pruning cache at: [CACHE_DIR]
DEBUG Pruning cache at: [CACHE_DIR]/simple-v4
Removed 1 directory
"###);
Ok(())
}
/// `cache prune` should remove any stale symlink from the cache.
///
/// Populates the cache, deletes the `wheels-v0` directory, and expects `cache prune
/// --verbose` to remove an entry under `archive-v0`.
#[test]
fn prune_stale_symlink() -> Result<()> {
let context = TestContext::new("3.12");
let requirements_txt = context.temp_dir.child("requirements.txt");
requirements_txt.write_str("anyio")?;
// Install a requirement, to populate the cache.
sync_command(&context)
.arg("requirements.txt")
.assert()
.success();
// Remove the wheels directory, causing the symlink to become stale.
let wheels = context.cache_dir.child("wheels-v0");
fs_err::remove_dir_all(wheels)?;
// The second filter rewrites any "Removing dangling cache entry: ..." line for a path
// under `archive-v0` into a fixed "Pruning cache at: [CACHE_DIR]/archive-v0/anyio" line,
// hiding the actual archive entry name from the snapshot.
let filters = [
(r"Pruning cache at: .*", "Pruning cache at: [CACHE_DIR]"),
(
r"Removing dangling cache entry: .*[\\|/]archive-v0[\\|/].*",
"Pruning cache at: [CACHE_DIR]/archive-v0/anyio",
),
]
.into_iter()
.chain(INSTA_FILTERS.to_vec())
.collect::<Vec<_>>();
// NOTE(review): the snapshot pins an exact file count (44), which presumably depends on
// the contents of the resolved `anyio` wheel; `[SIZE]` is presumably produced by one of
// the shared `INSTA_FILTERS` — confirm both.
uv_snapshot!(filters, prune_command(&context).arg("--verbose"), @r###"
success: true
exit_code: 0
----- stdout -----
----- stderr -----
Pruning cache at: [CACHE_DIR]
DEBUG Pruning cache at: [CACHE_DIR]/archive-v0/anyio
Removed 44 files ([SIZE])
"###);
Ok(())
}