From 051188dce0453732d8ce6730392cc31bceec1ce6 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Sat, 4 Nov 2023 08:46:42 -0700 Subject: [PATCH] Use separate representations for canonical repository vs. commit (#317) Given `https://github.com/pypa/package.git#subdirectory=pkg_a` and `https://github.com/pypa/package.git#subdirectory=pkg_b`, we want these to map to the same shared _resource_ (for locking and cloning), but different _packages_ (for determining whether the wheel already exists in the cache). As such, we need two distinct concepts for "canonical equality". Closes #316. --- crates/puffin-cache/src/canonical_url.rs | 147 +++++++++++++++--- crates/puffin-cache/src/lib.rs | 2 +- crates/puffin-cli/tests/pip_compile.rs | 37 +++++ ...ompile__compile_git_concurrent_access.snap | 23 +++ crates/puffin-git/src/source.rs | 4 +- crates/puffin-resolver/src/resolver.rs | 4 +- 6 files changed, 192 insertions(+), 25 deletions(-) create mode 100644 crates/puffin-cli/tests/snapshots/pip_compile__compile_git_concurrent_access.snap diff --git a/crates/puffin-cache/src/canonical_url.rs b/crates/puffin-cache/src/canonical_url.rs index 1abd2f28c..efb123699 100644 --- a/crates/puffin-cache/src/canonical_url.rs +++ b/crates/puffin-cache/src/canonical_url.rs @@ -2,15 +2,14 @@ use url::Url; use crate::cache_key::{CacheKey, CacheKeyHasher}; -/// A wrapper around `Url` which represents a "canonical" version of an -/// original URL. +/// A wrapper around `Url` which represents a "canonical" version of an original URL. /// -/// A "canonical" url is only intended for internal comparison purposes. It's -/// to help paper over mistakes such as depending on `github.com/foo/bar` vs. -/// `github.com/foo/bar.git`. This is **only** for internal purposes and -/// provides no means to actually read the underlying string value of the `Url` -/// it contains. This is intentional, because all fetching should still happen -/// within the context of the original URL. +/// A "canonical" url is only intended for internal comparison purposes. It's to help paper over +/// mistakes such as depending on `github.com/foo/bar` vs. `github.com/foo/bar.git`. +/// +/// This is **only** for internal purposes and provides no means to actually read the underlying +/// string value of the `Url` it contains. This is intentional, because all fetching should still +/// happen within the context of the original URL. #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] pub struct CanonicalUrl(Url); @@ -23,24 +22,13 @@ impl CanonicalUrl { url.path_segments_mut().unwrap().pop_if_empty(); } - // If a URL starts with a kind (like `git+`), remove it. - if let Some(suffix) = url.as_str().strip_prefix("git+") { - // If a Git URL ends in a reference (like a branch, tag, or commit), remove it. - if let Some((prefix, _)) = suffix.rsplit_once('@') { - url = prefix.parse().unwrap(); - } else { - url = suffix.parse().unwrap(); - } - } - // For GitHub URLs specifically, just lower-case everything. GitHub // treats both the same, but they hash differently, and we're gonna be // hashing them. This wants a more general solution, and also we're // almost certainly not using the same case conversion rules that GitHub // does. (See issue #84) if url.host_str() == Some("github.com") { - url = format!("https{}", &url[url::Position::AfterScheme..]) - .parse() + url.set_scheme(url.scheme().to_lowercase().as_str()) .unwrap(); let path = url.path().to_lowercase(); url.set_path(&path); @@ -60,6 +48,10 @@ impl CanonicalUrl { CanonicalUrl(url) } + + pub fn parse(url: &str) -> Result { + Ok(Self::new(&Url::parse(url)?)) + } } impl CacheKey for CanonicalUrl { @@ -69,3 +61,118 @@ impl CacheKey for CanonicalUrl { self.0.as_str().cache_key(state); } } + +/// Like [`CanonicalUrl`], but attempts to represent an underlying source repository, abstracting +/// away details like the specific commit or branch, or the subdirectory to build within the +/// repository. +/// +/// For example, `https://github.com/pypa/package.git#subdirectory=pkg_a` and +/// `https://github.com/pypa/package.git#subdirectory=pkg_b` would map to different +/// [`CanonicalUrl`] values, but the same [`RepositoryUrl`], since they map to the same +/// resource. +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] +pub struct RepositoryUrl(Url); + +impl RepositoryUrl { + pub fn new(url: &Url) -> RepositoryUrl { + let mut url = CanonicalUrl::new(url).0; + + // If a Git URL ends in a reference (like a branch, tag, or commit), remove it. + if url.scheme().starts_with("git+") { + if let Some((prefix, _)) = url.as_str().rsplit_once('@') { + url = prefix.parse().unwrap(); + } + } + + // Drop any fragments and query parameters. + url.set_fragment(None); + url.set_query(None); + + RepositoryUrl(url) + } + + pub fn parse(url: &str) -> Result { + Ok(Self::new(&Url::parse(url)?)) + } +} + +impl CacheKey for RepositoryUrl { + fn cache_key(&self, state: &mut CacheKeyHasher) { + // `as_str` gives the serialisation of a url (which has a spec) and so insulates against + // possible changes in how the URL crate does hashing. + self.0.as_str().cache_key(state); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn canonical_url() -> Result<(), url::ParseError> { + // Two URLs should be considered equal regardless of the `.git` suffix. + assert_eq!( + CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git")?, + CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages")?, + ); + + // Two URLs should be _not_ considered equal if they point to different repositories. + assert_ne!( + CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git")?, + CanonicalUrl::parse("git+https://github.com/pypa/sample-packages.git")?, + ); + + // Two URLs should _not_ be considered equal if they request different subdirectories. + assert_ne!( + CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_a")?, + CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_b")?, + ); + + // Two URLs should _not_ be considered equal if they request different commit tags. + assert_ne!( + CanonicalUrl::parse( + "git+https://github.com/pypa/sample-namespace-packages.git@v1.0.0" + )?, + CanonicalUrl::parse( + "git+https://github.com/pypa/sample-namespace-packages.git@v2.0.0" + )?, + ); + + Ok(()) + } + + #[test] + fn repository_url() -> Result<(), url::ParseError> { + // Two URLs should be considered equal regardless of the `.git` suffix. + assert_eq!( + CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git")?, + CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages")?, + ); + + // Two URLs should be _not_ considered equal if they point to different repositories. + assert_ne!( + CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git")?, + CanonicalUrl::parse("git+https://github.com/pypa/sample-packages.git")?, + ); + + // Two URLs should be considered equal if they map to the same repository, even if they + // request different subdirectories. + assert_eq!( + RepositoryUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_a")?, + RepositoryUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_b")?, + ); + + // Two URLs should be considered equal if they map to the same repository, even if they + // request different commit tags. + assert_eq!( + RepositoryUrl::parse( + "git+https://github.com/pypa/sample-namespace-packages.git@v1.0.0" + )?, + RepositoryUrl::parse( + "git+https://github.com/pypa/sample-namespace-packages.git@v2.0.0" + )?, + ); + + Ok(()) + } +} diff --git a/crates/puffin-cache/src/lib.rs b/crates/puffin-cache/src/lib.rs index 31da05a54..e266b5261 100644 --- a/crates/puffin-cache/src/lib.rs +++ b/crates/puffin-cache/src/lib.rs @@ -2,7 +2,7 @@ use std::hash::Hasher; use seahash::SeaHasher; -pub use canonical_url::CanonicalUrl; +pub use canonical_url::{CanonicalUrl, RepositoryUrl}; pub use digest::digest; mod cache_key; diff --git a/crates/puffin-cli/tests/pip_compile.rs b/crates/puffin-cli/tests/pip_compile.rs index c0c901417..d791d7ab0 100644 --- a/crates/puffin-cli/tests/pip_compile.rs +++ b/crates/puffin-cli/tests/pip_compile.rs @@ -885,6 +885,43 @@ fn compile_git_subdirectory_dependency() -> Result<()> { Ok(()) } +/// Resolve two packages from a `requirements.in` file with the same Git HTTPS dependency. +#[test] +fn compile_git_concurrent_access() -> Result<()> { + let temp_dir = assert_fs::TempDir::new()?; + let cache_dir = assert_fs::TempDir::new()?; + let venv = temp_dir.child(".venv"); + + Command::new(get_cargo_bin(BIN_NAME)) + .arg("venv") + .arg(venv.as_os_str()) + .arg("--cache-dir") + .arg(cache_dir.path()) + .current_dir(&temp_dir) + .assert() + .success(); + venv.assert(predicates::path::is_dir()); + + let requirements_in = temp_dir.child("requirements.in"); + requirements_in.touch()?; + requirements_in + .write_str("example-pkg-a @ git+https://github.com/pypa/sample-namespace-packages.git@df7530eeb8fa0cb7dbb8ecb28363e8e36bfa2f45#subdirectory=pkg_resources/pkg_a\nexample-pkg-b @ git+https://github.com/pypa/sample-namespace-packages.git@df7530eeb8fa0cb7dbb8ecb28363e8e36bfa2f45#subdirectory=pkg_resources/pkg_b")?; + + insta::with_settings!({ + filters => INSTA_FILTERS.to_vec() + }, { + assert_cmd_snapshot!(Command::new(get_cargo_bin(BIN_NAME)) + .arg("pip-compile") + .arg("requirements.in") + .arg("--cache-dir") + .arg(cache_dir.path()) + .env("VIRTUAL_ENV", venv.as_os_str()) + .current_dir(&temp_dir)); + }); + + Ok(()) +} + /// Resolve a Git dependency with a declared name that differs from the true name of the package. #[test] fn compile_git_mismatched_name() -> Result<()> { diff --git a/crates/puffin-cli/tests/snapshots/pip_compile__compile_git_concurrent_access.snap b/crates/puffin-cli/tests/snapshots/pip_compile__compile_git_concurrent_access.snap new file mode 100644 index 000000000..1d3d5df3c --- /dev/null +++ b/crates/puffin-cli/tests/snapshots/pip_compile__compile_git_concurrent_access.snap @@ -0,0 +1,23 @@ +--- +source: crates/puffin-cli/tests/pip_compile.rs +info: + program: puffin + args: + - pip-compile + - requirements.in + - "--cache-dir" + - /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmp41ygCm + env: + VIRTUAL_ENV: /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpBfaYxl/.venv +--- +success: true +exit_code: 0 +----- stdout ----- +# This file was autogenerated by Puffin v0.0.1 via the following command: +# [BIN_PATH] pip-compile requirements.in --cache-dir [CACHE_DIR] +example-pkg-a @ git+https://github.com/pypa/sample-namespace-packages.git@df7530eeb8fa0cb7dbb8ecb28363e8e36bfa2f45#subdirectory=pkg_resources/pkg_a +example-pkg-b @ git+https://github.com/pypa/sample-namespace-packages.git@df7530eeb8fa0cb7dbb8ecb28363e8e36bfa2f45#subdirectory=pkg_resources/pkg_b + +----- stderr ----- +Resolved 2 packages in [TIME] + diff --git a/crates/puffin-git/src/source.rs b/crates/puffin-git/src/source.rs index 7b487ed14..6f4fae711 100644 --- a/crates/puffin-git/src/source.rs +++ b/crates/puffin-git/src/source.rs @@ -7,7 +7,7 @@ use anyhow::Result; use reqwest::Client; use tracing::debug; -use puffin_cache::{digest, CanonicalUrl}; +use puffin_cache::{digest, RepositoryUrl}; use crate::git::GitRemote; use crate::{FetchStrategy, Git}; @@ -36,7 +36,7 @@ impl GitSource { pub fn fetch(self) -> Result { // The path to the repo, within the Git database. - let ident = digest(&CanonicalUrl::new(&self.git.url)); + let ident = digest(&RepositoryUrl::new(&self.git.url)); let db_path = self.cache.join("db").join(&ident); let remote = GitRemote::new(&self.git.url); diff --git a/crates/puffin-resolver/src/resolver.rs b/crates/puffin-resolver/src/resolver.rs index 88accb14b..95e37a96d 100644 --- a/crates/puffin-resolver/src/resolver.rs +++ b/crates/puffin-resolver/src/resolver.rs @@ -22,7 +22,7 @@ use waitmap::WaitMap; use distribution_filename::{SourceDistributionFilename, WheelFilename}; use pep508_rs::{MarkerEnvironment, Requirement}; use platform_tags::Tags; -use puffin_cache::CanonicalUrl; +use puffin_cache::RepositoryUrl; use puffin_client::RegistryClient; use puffin_distribution::{RemoteDistributionRef, VersionOrUrl}; use puffin_normalize::{ExtraName, PackageName}; @@ -923,7 +923,7 @@ impl Locks { /// Acquire a lock on the given resource. async fn acquire(&self, url: &Url) -> Arc> { let mut map = self.0.lock().await; - map.entry(puffin_cache::digest(&CanonicalUrl::new(url))) + map.entry(puffin_cache::digest(&RepositoryUrl::new(url))) .or_insert_with(|| Arc::new(Mutex::new(()))) .clone() }