Use separate representations for canonical repository vs. commit (#317)

Given `https://github.com/pypa/package.git#subdirectory=pkg_a` and
`https://github.com/pypa/package.git#subdirectory=pkg_b`, we want these
to map to the same shared _resource_ (for locking and cloning), but
different _packages_ (for determining whether the wheel already exists
in the cache). As such, we need two distinct concepts for "canonical
equality".

Closes #316.
This commit is contained in:
Charlie Marsh 2023-11-04 08:46:42 -07:00 committed by GitHub
parent b589813e59
commit 051188dce0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 192 additions and 25 deletions

View file

@ -2,15 +2,14 @@ use url::Url;
use crate::cache_key::{CacheKey, CacheKeyHasher};
/// A wrapper around `Url` which represents a "canonical" version of an original URL.
///
/// A "canonical" url is only intended for internal comparison purposes. It's to help paper over
/// mistakes such as depending on `github.com/foo/bar` vs. `github.com/foo/bar.git`.
///
/// This is **only** for internal purposes and provides no means to actually read the underlying
/// string value of the `Url` it contains. This is intentional, because all fetching should still
/// happen within the context of the original URL.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
pub struct CanonicalUrl(Url);
@ -23,24 +22,13 @@ impl CanonicalUrl {
url.path_segments_mut().unwrap().pop_if_empty();
}
// If a URL starts with a kind (like `git+`), remove it.
if let Some(suffix) = url.as_str().strip_prefix("git+") {
// If a Git URL ends in a reference (like a branch, tag, or commit), remove it.
if let Some((prefix, _)) = suffix.rsplit_once('@') {
url = prefix.parse().unwrap();
} else {
url = suffix.parse().unwrap();
}
}
// For GitHub URLs specifically, just lower-case everything. GitHub
// treats both the same, but they hash differently, and we're gonna be
// hashing them. This wants a more general solution, and also we're
// almost certainly not using the same case conversion rules that GitHub
// does. (See issue #84)
if url.host_str() == Some("github.com") {
url = format!("https{}", &url[url::Position::AfterScheme..])
.parse()
url.set_scheme(url.scheme().to_lowercase().as_str())
.unwrap();
let path = url.path().to_lowercase();
url.set_path(&path);
@ -60,6 +48,10 @@ impl CanonicalUrl {
CanonicalUrl(url)
}
/// Parses `url` into a [`Url`] and canonicalizes the result.
///
/// # Errors
///
/// Returns the [`url::ParseError`] reported by `Url::parse` when `url` is not a valid URL.
pub fn parse(url: &str) -> Result<Self, url::ParseError> {
    Url::parse(url).map(|parsed| Self::new(&parsed))
}
}
impl CacheKey for CanonicalUrl {
@ -69,3 +61,118 @@ impl CacheKey for CanonicalUrl {
self.0.as_str().cache_key(state);
}
}
/// Like [`CanonicalUrl`], but attempts to represent an underlying source repository, abstracting
/// away details like the specific commit or branch, or the subdirectory to build within the
/// repository.
///
/// For example, `https://github.com/pypa/package.git#subdirectory=pkg_a` and
/// `https://github.com/pypa/package.git#subdirectory=pkg_b` would map to different
/// [`CanonicalUrl`] values, but the same [`RepositoryUrl`], since they map to the same
/// resource.
///
/// Construction starts from the [`CanonicalUrl`] form of the input, removes a trailing Git
/// reference for `git+` schemes, and drops the fragment and query string. The wrapped `Url` is
/// kept private; within this module it is only fed into the cache key.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
pub struct RepositoryUrl(Url);
impl RepositoryUrl {
    /// Builds the repository-level identity for `url`.
    ///
    /// The URL is first canonicalized via [`CanonicalUrl::new`]. Then, for `git+` schemes, a
    /// trailing `@<reference>` (branch, tag, or commit) is removed from the path, and finally any
    /// fragment and query string are dropped.
    pub fn new(url: &Url) -> RepositoryUrl {
        let mut url = CanonicalUrl::new(url).0;

        // If a Git URL ends in a reference (like a branch, tag, or commit), remove it.
        //
        // Only the path is searched for `@`. Searching the whole serialized URL would also match
        // the userinfo separator (as in `git+ssh://git@github.com/org/repo.git`) or an `@` that
        // appears in the query or fragment, which would cut off the host and repository path.
        if url.scheme().starts_with("git+") {
            // Copy the prefix out so the immutable borrow of `url` ends before `set_path`.
            let trimmed = url
                .path()
                .rsplit_once('@')
                .map(|(prefix, _reference)| prefix.to_owned());
            if let Some(path) = trimmed {
                url.set_path(&path);
            }
        }

        // Drop any fragments and query parameters.
        url.set_fragment(None);
        url.set_query(None);

        RepositoryUrl(url)
    }

    /// Parses `url` into a [`Url`] and reduces it to its repository-level form.
    ///
    /// # Errors
    ///
    /// Returns the [`url::ParseError`] reported by `Url::parse` when `url` is not a valid URL.
    pub fn parse(url: &str) -> Result<Self, url::ParseError> {
        Ok(Self::new(&Url::parse(url)?))
    }
}
impl CacheKey for RepositoryUrl {
    /// Feeds the serialized form of the wrapped URL into `state`.
    fn cache_key(&self, state: &mut CacheKeyHasher) {
        // Hash the string serialization rather than the `Url` value itself: the serialized form
        // is governed by a spec, so the key does not depend on how the URL crate hashes.
        let serialized = self.0.as_str();
        serialized.cache_key(state);
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `CanonicalUrl` ignores the `.git` suffix but distinguishes repositories, subdirectories,
    /// and Git references.
    #[test]
    fn canonical_url() -> Result<(), url::ParseError> {
        // Two URLs should be considered equal regardless of the `.git` suffix.
        assert_eq!(
            CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git")?,
            CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages")?,
        );

        // Two URLs should be _not_ considered equal if they point to different repositories.
        assert_ne!(
            CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git")?,
            CanonicalUrl::parse("git+https://github.com/pypa/sample-packages.git")?,
        );

        // Two URLs should _not_ be considered equal if they request different subdirectories.
        assert_ne!(
            CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_a")?,
            CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_b")?,
        );

        // Two URLs should _not_ be considered equal if they request different commit tags.
        assert_ne!(
            CanonicalUrl::parse(
                "git+https://github.com/pypa/sample-namespace-packages.git@v1.0.0"
            )?,
            CanonicalUrl::parse(
                "git+https://github.com/pypa/sample-namespace-packages.git@v2.0.0"
            )?,
        );

        Ok(())
    }

    /// `RepositoryUrl` ignores the `.git` suffix, subdirectories, and Git references, but still
    /// distinguishes repositories.
    #[test]
    fn repository_url() -> Result<(), url::ParseError> {
        // Two URLs should be considered equal regardless of the `.git` suffix.
        assert_eq!(
            RepositoryUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git")?,
            RepositoryUrl::parse("git+https://github.com/pypa/sample-namespace-packages")?,
        );

        // Two URLs should be _not_ considered equal if they point to different repositories.
        assert_ne!(
            RepositoryUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git")?,
            RepositoryUrl::parse("git+https://github.com/pypa/sample-packages.git")?,
        );

        // Two URLs should be considered equal if they map to the same repository, even if they
        // request different subdirectories.
        assert_eq!(
            RepositoryUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_a")?,
            RepositoryUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_b")?,
        );

        // Two URLs should be considered equal if they map to the same repository, even if they
        // request different commit tags.
        assert_eq!(
            RepositoryUrl::parse(
                "git+https://github.com/pypa/sample-namespace-packages.git@v1.0.0"
            )?,
            RepositoryUrl::parse(
                "git+https://github.com/pypa/sample-namespace-packages.git@v2.0.0"
            )?,
        );

        Ok(())
    }
}

View file

@ -2,7 +2,7 @@ use std::hash::Hasher;
use seahash::SeaHasher;
pub use canonical_url::CanonicalUrl;
pub use canonical_url::{CanonicalUrl, RepositoryUrl};
pub use digest::digest;
mod cache_key;