Use separate representations for canonical repository vs. commit (#317)

Given `https://github.com/pypa/package.git#subdirectory=pkg_a` and
`https://github.com/pypa/package.git#subdirectory=pkg_b`, we want these
to map to the same shared _resource_ (for locking and cloning), but
different _packages_ (for determining whether the wheel already exists
in the cache). As such, we need two distinct concepts for "canonical
equality".

Closes #316.
This commit is contained in:
Charlie Marsh 2023-11-04 08:46:42 -07:00 committed by GitHub
parent b589813e59
commit 051188dce0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 192 additions and 25 deletions

View file

@ -2,15 +2,14 @@ use url::Url;
use crate::cache_key::{CacheKey, CacheKeyHasher};
/// A wrapper around `Url` which represents a "canonical" version of an
/// original URL.
/// A wrapper around `Url` which represents a "canonical" version of an original URL.
///
/// A "canonical" url is only intended for internal comparison purposes. It's
/// to help paper over mistakes such as depending on `github.com/foo/bar` vs.
/// `github.com/foo/bar.git`. This is **only** for internal purposes and
/// provides no means to actually read the underlying string value of the `Url`
/// it contains. This is intentional, because all fetching should still happen
/// within the context of the original URL.
/// A "canonical" url is only intended for internal comparison purposes. It's to help paper over
/// mistakes such as depending on `github.com/foo/bar` vs. `github.com/foo/bar.git`.
///
/// This is **only** for internal purposes and provides no means to actually read the underlying
/// string value of the `Url` it contains. This is intentional, because all fetching should still
/// happen within the context of the original URL.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
pub struct CanonicalUrl(Url);
@ -23,24 +22,13 @@ impl CanonicalUrl {
url.path_segments_mut().unwrap().pop_if_empty();
}
// If a URL starts with a kind (like `git+`), remove it.
if let Some(suffix) = url.as_str().strip_prefix("git+") {
// If a Git URL ends in a reference (like a branch, tag, or commit), remove it.
if let Some((prefix, _)) = suffix.rsplit_once('@') {
url = prefix.parse().unwrap();
} else {
url = suffix.parse().unwrap();
}
}
// For GitHub URLs specifically, just lower-case everything. GitHub
// treats both the same, but they hash differently, and we're gonna be
// hashing them. This wants a more general solution, and also we're
// almost certainly not using the same case conversion rules that GitHub
// does. (See issue #84)
if url.host_str() == Some("github.com") {
url = format!("https{}", &url[url::Position::AfterScheme..])
.parse()
url.set_scheme(url.scheme().to_lowercase().as_str())
.unwrap();
let path = url.path().to_lowercase();
url.set_path(&path);
@ -60,6 +48,10 @@ impl CanonicalUrl {
CanonicalUrl(url)
}
pub fn parse(url: &str) -> Result<Self, url::ParseError> {
Ok(Self::new(&Url::parse(url)?))
}
}
impl CacheKey for CanonicalUrl {
@ -69,3 +61,118 @@ impl CacheKey for CanonicalUrl {
self.0.as_str().cache_key(state);
}
}
/// Like [`CanonicalUrl`], but attempts to represent an underlying source repository, abstracting
/// away details like the specific commit or branch, or the subdirectory to build within the
/// repository.
///
/// For example, `https://github.com/pypa/package.git#subdirectory=pkg_a` and
/// `https://github.com/pypa/package.git#subdirectory=pkg_b` would map to different
/// [`CanonicalUrl`] values, but the same [`RepositoryUrl`], since they map to the same
/// resource.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
pub struct RepositoryUrl(Url);
impl RepositoryUrl {
pub fn new(url: &Url) -> RepositoryUrl {
let mut url = CanonicalUrl::new(url).0;
// If a Git URL ends in a reference (like a branch, tag, or commit), remove it.
if url.scheme().starts_with("git+") {
if let Some((prefix, _)) = url.as_str().rsplit_once('@') {
url = prefix.parse().unwrap();
}
}
// Drop any fragments and query parameters.
url.set_fragment(None);
url.set_query(None);
RepositoryUrl(url)
}
pub fn parse(url: &str) -> Result<Self, url::ParseError> {
Ok(Self::new(&Url::parse(url)?))
}
}
impl CacheKey for RepositoryUrl {
fn cache_key(&self, state: &mut CacheKeyHasher) {
// `as_str` gives the serialisation of a url (which has a spec) and so insulates against
// possible changes in how the URL crate does hashing.
self.0.as_str().cache_key(state);
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn canonical_url() -> Result<(), url::ParseError> {
// Two URLs should be considered equal regardless of the `.git` suffix.
assert_eq!(
CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git")?,
CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages")?,
);
// Two URLs should be _not_ considered equal if they point to different repositories.
assert_ne!(
CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git")?,
CanonicalUrl::parse("git+https://github.com/pypa/sample-packages.git")?,
);
// Two URLs should _not_ be considered equal if they request different subdirectories.
assert_ne!(
CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_a")?,
CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_b")?,
);
// Two URLs should _not_ be considered equal if they request different commit tags.
assert_ne!(
CanonicalUrl::parse(
"git+https://github.com/pypa/sample-namespace-packages.git@v1.0.0"
)?,
CanonicalUrl::parse(
"git+https://github.com/pypa/sample-namespace-packages.git@v2.0.0"
)?,
);
Ok(())
}
#[test]
fn repository_url() -> Result<(), url::ParseError> {
// Two URLs should be considered equal regardless of the `.git` suffix.
assert_eq!(
CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git")?,
CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages")?,
);
// Two URLs should be _not_ considered equal if they point to different repositories.
assert_ne!(
CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git")?,
CanonicalUrl::parse("git+https://github.com/pypa/sample-packages.git")?,
);
// Two URLs should be considered equal if they map to the same repository, even if they
// request different subdirectories.
assert_eq!(
RepositoryUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_a")?,
RepositoryUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_b")?,
);
// Two URLs should be considered equal if they map to the same repository, even if they
// request different commit tags.
assert_eq!(
RepositoryUrl::parse(
"git+https://github.com/pypa/sample-namespace-packages.git@v1.0.0"
)?,
RepositoryUrl::parse(
"git+https://github.com/pypa/sample-namespace-packages.git@v2.0.0"
)?,
);
Ok(())
}
}

View file

@ -2,7 +2,7 @@ use std::hash::Hasher;
use seahash::SeaHasher;
pub use canonical_url::CanonicalUrl;
pub use canonical_url::{CanonicalUrl, RepositoryUrl};
pub use digest::digest;
mod cache_key;

View file

@ -885,6 +885,43 @@ fn compile_git_subdirectory_dependency() -> Result<()> {
Ok(())
}
/// Resolve two packages from a `requirements.in` file with the same Git HTTPS dependency.
#[test]
fn compile_git_concurrent_access() -> Result<()> {
let temp_dir = assert_fs::TempDir::new()?;
let cache_dir = assert_fs::TempDir::new()?;
let venv = temp_dir.child(".venv");
Command::new(get_cargo_bin(BIN_NAME))
.arg("venv")
.arg(venv.as_os_str())
.arg("--cache-dir")
.arg(cache_dir.path())
.current_dir(&temp_dir)
.assert()
.success();
venv.assert(predicates::path::is_dir());
let requirements_in = temp_dir.child("requirements.in");
requirements_in.touch()?;
requirements_in
.write_str("example-pkg-a @ git+https://github.com/pypa/sample-namespace-packages.git@df7530eeb8fa0cb7dbb8ecb28363e8e36bfa2f45#subdirectory=pkg_resources/pkg_a\nexample-pkg-b @ git+https://github.com/pypa/sample-namespace-packages.git@df7530eeb8fa0cb7dbb8ecb28363e8e36bfa2f45#subdirectory=pkg_resources/pkg_b")?;
insta::with_settings!({
filters => INSTA_FILTERS.to_vec()
}, {
assert_cmd_snapshot!(Command::new(get_cargo_bin(BIN_NAME))
.arg("pip-compile")
.arg("requirements.in")
.arg("--cache-dir")
.arg(cache_dir.path())
.env("VIRTUAL_ENV", venv.as_os_str())
.current_dir(&temp_dir));
});
Ok(())
}
/// Resolve a Git dependency with a declared name that differs from the true name of the package.
#[test]
fn compile_git_mismatched_name() -> Result<()> {

View file

@ -0,0 +1,23 @@
---
source: crates/puffin-cli/tests/pip_compile.rs
info:
program: puffin
args:
- pip-compile
- requirements.in
- "--cache-dir"
- /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmp41ygCm
env:
VIRTUAL_ENV: /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpBfaYxl/.venv
---
success: true
exit_code: 0
----- stdout -----
# This file was autogenerated by Puffin v0.0.1 via the following command:
# [BIN_PATH] pip-compile requirements.in --cache-dir [CACHE_DIR]
example-pkg-a @ git+https://github.com/pypa/sample-namespace-packages.git@df7530eeb8fa0cb7dbb8ecb28363e8e36bfa2f45#subdirectory=pkg_resources/pkg_a
example-pkg-b @ git+https://github.com/pypa/sample-namespace-packages.git@df7530eeb8fa0cb7dbb8ecb28363e8e36bfa2f45#subdirectory=pkg_resources/pkg_b
----- stderr -----
Resolved 2 packages in [TIME]

View file

@ -7,7 +7,7 @@ use anyhow::Result;
use reqwest::Client;
use tracing::debug;
use puffin_cache::{digest, CanonicalUrl};
use puffin_cache::{digest, RepositoryUrl};
use crate::git::GitRemote;
use crate::{FetchStrategy, Git};
@ -36,7 +36,7 @@ impl GitSource {
pub fn fetch(self) -> Result<Fetch> {
// The path to the repo, within the Git database.
let ident = digest(&CanonicalUrl::new(&self.git.url));
let ident = digest(&RepositoryUrl::new(&self.git.url));
let db_path = self.cache.join("db").join(&ident);
let remote = GitRemote::new(&self.git.url);

View file

@ -22,7 +22,7 @@ use waitmap::WaitMap;
use distribution_filename::{SourceDistributionFilename, WheelFilename};
use pep508_rs::{MarkerEnvironment, Requirement};
use platform_tags::Tags;
use puffin_cache::CanonicalUrl;
use puffin_cache::RepositoryUrl;
use puffin_client::RegistryClient;
use puffin_distribution::{RemoteDistributionRef, VersionOrUrl};
use puffin_normalize::{ExtraName, PackageName};
@ -923,7 +923,7 @@ impl Locks {
/// Acquire a lock on the given resource.
async fn acquire(&self, url: &Url) -> Arc<Mutex<()>> {
let mut map = self.0.lock().await;
map.entry(puffin_cache::digest(&CanonicalUrl::new(url)))
map.entry(puffin_cache::digest(&RepositoryUrl::new(url)))
.or_insert_with(|| Arc::new(Mutex::new(())))
.clone()
}