From ffd78d0821e91fc908f8d497b521f01a8ed436b6 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Tue, 26 Mar 2024 21:39:01 -0400 Subject: [PATCH] Add an in-memory cache for Git references (#2682) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Ensures that, even if we try to resolve the same Git reference twice within an invocation, it always returns a (cached) consistent result. Closes https://github.com/astral-sh/uv/issues/2673. ## Test Plan ``` ❯ cargo run pip install git+https://github.com/pallets/flask.git --reinstall --no-cache Compiling uv-distribution v0.0.1 (/Users/crmarsh/workspace/uv/crates/uv-distribution) Compiling uv-resolver v0.0.1 (/Users/crmarsh/workspace/uv/crates/uv-resolver) Compiling uv-installer v0.0.1 (/Users/crmarsh/workspace/uv/crates/uv-installer) Compiling uv-dispatch v0.0.1 (/Users/crmarsh/workspace/uv/crates/uv-dispatch) Compiling uv-requirements v0.1.0 (/Users/crmarsh/workspace/uv/crates/uv-requirements) Compiling uv v0.1.24 (/Users/crmarsh/workspace/uv/crates/uv) Finished dev [unoptimized + debuginfo] target(s) in 3.95s Running `target/debug/uv pip install 'git+https://github.com/pallets/flask.git' --reinstall --no-cache` Updated https://github.com/pallets/flask.git (b90a4f1) Resolved 7 packages in 280ms Built flask @ git+https://github.com/pallets/flask.git@b90a4f1f4a370e92054b9cc9db0efcb864f87ebe Downloaded 7 packages in 212ms Installed 7 packages in 9ms ``` --- Cargo.lock | 1 + crates/uv-distribution/Cargo.toml | 4 +- .../src/distribution_database.rs | 51 +------ crates/uv-distribution/src/git.rs | 141 ++++++++++++++++++ crates/uv-distribution/src/lib.rs | 1 + crates/uv-distribution/src/source/mod.rs | 43 +----- crates/uv-git/src/git.rs | 2 +- crates/uv-git/src/lib.rs | 2 +- crates/uv-git/src/sha.rs | 2 +- 9 files changed, 161 insertions(+), 86 deletions(-) create mode 100644 crates/uv-distribution/src/git.rs diff --git a/Cargo.lock b/Cargo.lock index 9f048263b..c0f65ce1b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4570,6 +4570,7 @@ dependencies = [ "futures", "install-wheel-rs", "nanoid", + "once_cell", "pep440_rs", "pep508_rs", "platform-tags", diff --git a/crates/uv-distribution/Cargo.toml b/crates/uv-distribution/Cargo.toml index d235b8d46..42b8c9e12 100644 --- a/crates/uv-distribution/Cargo.toml +++ b/crates/uv-distribution/Cargo.toml @@ -20,6 +20,7 @@ install-wheel-rs = { workspace = true } pep440_rs = { workspace = true } pep508_rs = { workspace = true } platform-tags = { workspace = true } +pypi-types = { workspace = true } uv-cache = { workspace = true } uv-client = { workspace = true } uv-extract = { workspace = true } @@ -27,12 +28,12 @@ uv-fs = { workspace = true, features = ["tokio"] } uv-git = { workspace = true, features = ["vendored-openssl"] } uv-normalize = { workspace = true } uv-types = { workspace = true } -pypi-types = { workspace = true } anyhow = { workspace = true } fs-err = { workspace = true } futures = { workspace = true } nanoid = { workspace = true } +once_cell = { workspace = true } reqwest = { workspace = true } reqwest-middleware = { workspace = true } rmp-serde = { workspace = true } @@ -45,3 +46,4 @@ tokio-util = { workspace = true, features = ["compat"] } tracing = { workspace = true } url = { workspace = true } zip = { workspace = true } + diff --git a/crates/uv-distribution/src/distribution_database.rs b/crates/uv-distribution/src/distribution_database.rs index f818c0493..6f9d1be0b 100644 --- a/crates/uv-distribution/src/distribution_database.rs +++ b/crates/uv-distribution/src/distribution_database.rs @@ -11,19 +11,17 @@ use url::Url; use distribution_filename::WheelFilename; use distribution_types::{ - BuildableSource, BuiltDist, DirectGitUrl, Dist, FileLocation, IndexLocations, LocalEditable, - Name, SourceDist, + BuildableSource, BuiltDist, Dist, FileLocation, IndexLocations, LocalEditable, Name, }; use platform_tags::Tags; use pypi_types::Metadata23; use uv_cache::{ArchiveTarget, ArchiveTimestamp, Cache, CacheBucket, CacheEntry, WheelCache}; use uv_client::{CacheControl, CachedClientError, Connectivity, RegistryClient}; -use uv_git::GitSource; use uv_types::{BuildContext, NoBinary, NoBuild}; use crate::download::{BuiltWheel, UnzippedWheel}; +use crate::git::resolve_precise; use crate::locks::Locks; -use crate::reporter::Facade; use crate::{DiskWheel, Error, LocalWheel, Reporter, SourceDistCachedBuilder}; /// A cached high-level interface to convert distributions (a requirement resolved to a location) @@ -356,7 +354,12 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> let _guard = lock.lock().await; // Insert the `precise` URL, if it exists. - let precise = self.precise(source_dist).await?; + let precise = resolve_precise( + source_dist, + self.build_context.cache(), + self.reporter.as_ref(), + ) + .await?; let source_dist = match precise.as_ref() { Some(url) => Cow::Owned(source_dist.clone().with_url(url.clone())), @@ -393,44 +396,6 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> Ok((LocalWheel::Built(built_wheel), metadata)) } - /// Given a remote source distribution, return a precise variant, if possible. - /// - /// For example, given a Git dependency with a reference to a branch or tag, return a URL - /// with a precise reference to the current commit of that branch or tag. - /// - /// This method takes into account various normalizations that are independent from the Git - /// layer. For example: removing `#subdirectory=pkg_dir`-like fragments, and removing `git+` - /// prefix kinds. - async fn precise(&self, dist: &SourceDist) -> Result, Error> { - let SourceDist::Git(source_dist) = dist else { - return Ok(None); - }; - let git_dir = self.build_context.cache().bucket(CacheBucket::Git); - - let DirectGitUrl { url, subdirectory } = - DirectGitUrl::try_from(source_dist.url.raw()).map_err(Error::Git)?; - - // If the commit already contains a complete SHA, short-circuit. - if url.precise().is_some() { - return Ok(None); - } - - // Fetch the precise SHA of the Git reference (which could be a branch, a tag, a partial - // commit, etc.). - let source = if let Some(reporter) = self.reporter.clone() { - GitSource::new(url, git_dir).with_reporter(Facade::from(reporter)) - } else { - GitSource::new(url, git_dir) - }; - let precise = tokio::task::spawn_blocking(move || source.fetch()) - .await? - .map_err(Error::Git)?; - let url = precise.into_git(); - - // Re-encode as a URL. - Ok(Some(Url::from(DirectGitUrl { url, subdirectory }))) - } - /// Stream a wheel from a URL, unzipping it into the cache as it's downloaded. async fn stream_wheel( &self, diff --git a/crates/uv-distribution/src/git.rs b/crates/uv-distribution/src/git.rs new file mode 100644 index 000000000..fcda0f8cc --- /dev/null +++ b/crates/uv-distribution/src/git.rs @@ -0,0 +1,141 @@ +use std::path::PathBuf; +use std::sync::{Arc, Mutex}; + +use anyhow::Result; +use fs_err::tokio as fs; +use once_cell::sync::Lazy; +use rustc_hash::FxHashMap; +use tracing::debug; +use url::Url; + +use distribution_types::{DirectGitUrl, SourceDist}; +use uv_cache::{Cache, CacheBucket}; +use uv_fs::LockedFile; +use uv_git::{Fetch, GitSource, GitUrl}; + +use crate::error::Error; +use crate::reporter::Facade; +use crate::Reporter; + +/// Global cache of resolved Git references. +/// +/// Used to ensure that a given Git URL is only resolved once, and that the resolved URL is +/// consistent across all invocations. (For example: if a Git URL refers to a branch, like `main`, +/// then the resolved URL should always refer to the same commit across the lifetime of the +/// process.) +static RESOLVED_GIT_REFS: Lazy>> = Lazy::new(Mutex::default); + +/// Download a source distribution from a Git repository. +pub(crate) async fn fetch_git_archive( + url: &Url, + cache: &Cache, + reporter: Option<&Arc>, +) -> Result<(Fetch, Option), Error> { + debug!("Fetching source distribution from Git: {url}"); + let git_dir = cache.bucket(CacheBucket::Git); + + // Avoid races between different processes, too. + let lock_dir = git_dir.join("locks"); + fs::create_dir_all(&lock_dir) + .await + .map_err(Error::CacheWrite)?; + let canonical_url = cache_key::CanonicalUrl::new(url); + let _lock = LockedFile::acquire( + lock_dir.join(cache_key::digest(&canonical_url)), + &canonical_url, + ) + .map_err(Error::CacheWrite)?; + + let DirectGitUrl { url, subdirectory } = DirectGitUrl::try_from(url).map_err(Error::Git)?; + + // Extract the resolved URL from the in-memory cache, to save a look-up in the fetch. + let url = { + let resolved_git_refs = RESOLVED_GIT_REFS.lock().unwrap(); + if let Some(resolved) = resolved_git_refs.get(&url) { + resolved.clone() + } else { + url + } + }; + + // Fetch the Git repository. + let source = if let Some(reporter) = reporter { + GitSource::new(url.clone(), git_dir).with_reporter(Facade::from(reporter.clone())) + } else { + GitSource::new(url.clone(), git_dir) + }; + let fetch = tokio::task::spawn_blocking(move || source.fetch()) + .await? + .map_err(Error::Git)?; + + // Insert the resolved URL into the in-memory cache. + { + let mut resolved_git_refs = RESOLVED_GIT_REFS.lock().unwrap(); + let precise = fetch.git().clone(); + resolved_git_refs.insert(url, precise); + } + + Ok((fetch, subdirectory)) +} + +/// Given a remote source distribution, return a precise variant, if possible. +/// +/// For example, given a Git dependency with a reference to a branch or tag, return a URL +/// with a precise reference to the current commit of that branch or tag. +/// +/// This method takes into account various normalizations that are independent from the Git +/// layer. For example: removing `#subdirectory=pkg_dir`-like fragments, and removing `git+` +/// prefix kinds. +pub(crate) async fn resolve_precise( + dist: &SourceDist, + cache: &Cache, + reporter: Option<&Arc>, +) -> Result, Error> { + let SourceDist::Git(source_dist) = dist else { + return Ok(None); + }; + let git_dir = cache.bucket(CacheBucket::Git); + + let DirectGitUrl { url, subdirectory } = + DirectGitUrl::try_from(source_dist.url.raw()).map_err(Error::Git)?; + + // If the Git reference already contains a complete SHA, short-circuit. + if url.precise().is_some() { + return Ok(None); + } + + // If the Git reference is in the in-memory cache, return it. + { + let resolved_git_refs = RESOLVED_GIT_REFS.lock().unwrap(); + if let Some(precise) = resolved_git_refs.get(&url) { + return Ok(Some(Url::from(DirectGitUrl { + url: precise.clone(), + subdirectory, + }))); + } + } + + // Fetch the precise SHA of the Git reference (which could be a branch, a tag, a partial + // commit, etc.). + let source = if let Some(reporter) = reporter { + GitSource::new(url.clone(), git_dir).with_reporter(Facade::from(reporter.clone())) + } else { + GitSource::new(url.clone(), git_dir) + }; + let fetch = tokio::task::spawn_blocking(move || source.fetch()) + .await? + .map_err(Error::Git)?; + let precise = fetch.into_git(); + + // Insert the resolved URL into the in-memory cache. + { + let mut resolved_git_refs = RESOLVED_GIT_REFS.lock().unwrap(); + resolved_git_refs.insert(url.clone(), precise.clone()); + } + + // Re-encode as a URL. + Ok(Some(Url::from(DirectGitUrl { + url: precise, + subdirectory, + }))) +} diff --git a/crates/uv-distribution/src/lib.rs b/crates/uv-distribution/src/lib.rs index 43d4c7054..e560ddccd 100644 --- a/crates/uv-distribution/src/lib.rs +++ b/crates/uv-distribution/src/lib.rs @@ -9,6 +9,7 @@ pub use unzip::Unzip; mod distribution_database; mod download; mod error; +mod git; mod index; mod locks; mod reporter; diff --git a/crates/uv-distribution/src/source/mod.rs b/crates/uv-distribution/src/source/mod.rs index 78689c401..b5eae480e 100644 --- a/crates/uv-distribution/src/source/mod.rs +++ b/crates/uv-distribution/src/source/mod.rs @@ -17,8 +17,8 @@ use zip::ZipArchive; use distribution_filename::WheelFilename; use distribution_types::{ - BuildableSource, DirectArchiveUrl, DirectGitUrl, Dist, FileLocation, GitSourceUrl, - LocalEditable, PathSourceDist, PathSourceUrl, RemoteSource, SourceDist, SourceUrl, + BuildableSource, DirectArchiveUrl, Dist, FileLocation, GitSourceUrl, LocalEditable, + PathSourceDist, PathSourceUrl, RemoteSource, SourceDist, SourceUrl, }; use install_wheel_rs::metadata::read_archive_metadata; use pep508_rs::Scheme; @@ -31,12 +31,11 @@ use uv_cache::{ use uv_client::{ CacheControl, CachedClientError, Connectivity, DataWithCachePolicy, RegistryClient, }; -use uv_fs::{write_atomic, LockedFile}; -use uv_git::{Fetch, GitSource}; +use uv_fs::write_atomic; use uv_types::{BuildContext, BuildKind, NoBuild, SourceBuildTrait}; use crate::error::Error; -use crate::reporter::Facade; +use crate::git::fetch_git_archive; use crate::source::built_wheel_metadata::BuiltWheelMetadata; use crate::source::manifest::Manifest; use crate::Reporter; @@ -1233,40 +1232,6 @@ async fn extract_archive(path: &Path, cache: &Cache) -> Result>, -) -> Result<(Fetch, Option), Error> { - debug!("Fetching source distribution from Git: {url}"); - let git_dir = cache.bucket(CacheBucket::Git); - - // Avoid races between different processes, too. - let lock_dir = git_dir.join("locks"); - fs::create_dir_all(&lock_dir) - .await - .map_err(Error::CacheWrite)?; - let canonical_url = cache_key::CanonicalUrl::new(url); - let _lock = LockedFile::acquire( - lock_dir.join(cache_key::digest(&canonical_url)), - &canonical_url, - ) - .map_err(Error::CacheWrite)?; - - let DirectGitUrl { url, subdirectory } = DirectGitUrl::try_from(url).map_err(Error::Git)?; - - let source = if let Some(reporter) = reporter { - GitSource::new(url, git_dir).with_reporter(Facade::from(reporter.clone())) - } else { - GitSource::new(url, git_dir) - }; - let fetch = tokio::task::spawn_blocking(move || source.fetch()) - .await? - .map_err(Error::Git)?; - Ok((fetch, subdirectory)) -} - /// Download and extract a source distribution from a URL. /// /// This function will download the source distribution from the given URL, and extract it into a diff --git a/crates/uv-git/src/git.rs b/crates/uv-git/src/git.rs index 20b0771a7..05e63f2ad 100644 --- a/crates/uv-git/src/git.rs +++ b/crates/uv-git/src/git.rs @@ -23,7 +23,7 @@ use crate::FetchStrategy; const CHECKOUT_READY_LOCK: &str = ".ok"; /// A reference to commit or commit-ish. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub(crate) enum GitReference { /// From a branch. #[allow(unused)] diff --git a/crates/uv-git/src/lib.rs b/crates/uv-git/src/lib.rs index 682c59db6..7b6c28a6f 100644 --- a/crates/uv-git/src/lib.rs +++ b/crates/uv-git/src/lib.rs @@ -12,7 +12,7 @@ mod source; mod util; /// A URL reference to a Git repository. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct GitUrl { /// The URL of the Git repository, with any query parameters and fragments removed. repository: Url, diff --git a/crates/uv-git/src/sha.rs b/crates/uv-git/src/sha.rs index 86b910180..c9d64b50c 100644 --- a/crates/uv-git/src/sha.rs +++ b/crates/uv-git/src/sha.rs @@ -1,7 +1,7 @@ use std::str::FromStr; /// A complete Git SHA, i.e., a 40-character hexadecimal representation of a Git commit. -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] pub struct GitSha(git2::Oid); impl GitSha {