mirror of
https://github.com/astral-sh/uv.git
synced 2025-08-04 10:58:28 +00:00
Use separate representations for canonical repository vs. commit (#317)
Given `https://github.com/pypa/package.git#subdirectory=pkg_a` and `https://github.com/pypa/package.git#subdirectory=pkg_b`, we want these to map to the same shared _resource_ (for locking and cloning), but different _packages_ (for determining whether the wheel already exists in the cache). As such, we need two distinct concepts for "canonical equality". Closes #316.
This commit is contained in:
parent
b589813e59
commit
051188dce0
6 changed files with 192 additions and 25 deletions
|
@ -2,15 +2,14 @@ use url::Url;
|
|||
|
||||
use crate::cache_key::{CacheKey, CacheKeyHasher};
|
||||
|
||||
/// A wrapper around `Url` which represents a "canonical" version of an
|
||||
/// original URL.
|
||||
/// A wrapper around `Url` which represents a "canonical" version of an original URL.
|
||||
///
|
||||
/// A "canonical" url is only intended for internal comparison purposes. It's
|
||||
/// to help paper over mistakes such as depending on `github.com/foo/bar` vs.
|
||||
/// `github.com/foo/bar.git`. This is **only** for internal purposes and
|
||||
/// provides no means to actually read the underlying string value of the `Url`
|
||||
/// it contains. This is intentional, because all fetching should still happen
|
||||
/// within the context of the original URL.
|
||||
/// A "canonical" url is only intended for internal comparison purposes. It's to help paper over
|
||||
/// mistakes such as depending on `github.com/foo/bar` vs. `github.com/foo/bar.git`.
|
||||
///
|
||||
/// This is **only** for internal purposes and provides no means to actually read the underlying
|
||||
/// string value of the `Url` it contains. This is intentional, because all fetching should still
|
||||
/// happen within the context of the original URL.
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
pub struct CanonicalUrl(Url);
|
||||
|
||||
|
@ -23,24 +22,13 @@ impl CanonicalUrl {
|
|||
url.path_segments_mut().unwrap().pop_if_empty();
|
||||
}
|
||||
|
||||
// If a URL starts with a kind (like `git+`), remove it.
|
||||
if let Some(suffix) = url.as_str().strip_prefix("git+") {
|
||||
// If a Git URL ends in a reference (like a branch, tag, or commit), remove it.
|
||||
if let Some((prefix, _)) = suffix.rsplit_once('@') {
|
||||
url = prefix.parse().unwrap();
|
||||
} else {
|
||||
url = suffix.parse().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
// For GitHub URLs specifically, just lower-case everything. GitHub
|
||||
// treats both the same, but they hash differently, and we're gonna be
|
||||
// hashing them. This wants a more general solution, and also we're
|
||||
// almost certainly not using the same case conversion rules that GitHub
|
||||
// does. (See issue #84)
|
||||
if url.host_str() == Some("github.com") {
|
||||
url = format!("https{}", &url[url::Position::AfterScheme..])
|
||||
.parse()
|
||||
url.set_scheme(url.scheme().to_lowercase().as_str())
|
||||
.unwrap();
|
||||
let path = url.path().to_lowercase();
|
||||
url.set_path(&path);
|
||||
|
@ -60,6 +48,10 @@ impl CanonicalUrl {
|
|||
|
||||
CanonicalUrl(url)
|
||||
}
|
||||
|
||||
pub fn parse(url: &str) -> Result<Self, url::ParseError> {
|
||||
Ok(Self::new(&Url::parse(url)?))
|
||||
}
|
||||
}
|
||||
|
||||
impl CacheKey for CanonicalUrl {
|
||||
|
@ -69,3 +61,118 @@ impl CacheKey for CanonicalUrl {
|
|||
self.0.as_str().cache_key(state);
|
||||
}
|
||||
}
|
||||
|
||||
/// Like [`CanonicalUrl`], but attempts to represent an underlying source repository, abstracting
|
||||
/// away details like the specific commit or branch, or the subdirectory to build within the
|
||||
/// repository.
|
||||
///
|
||||
/// For example, `https://github.com/pypa/package.git#subdirectory=pkg_a` and
|
||||
/// `https://github.com/pypa/package.git#subdirectory=pkg_b` would map to different
|
||||
/// [`CanonicalUrl`] values, but the same [`RepositoryUrl`], since they map to the same
|
||||
/// resource.
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
pub struct RepositoryUrl(Url);
|
||||
|
||||
impl RepositoryUrl {
|
||||
pub fn new(url: &Url) -> RepositoryUrl {
|
||||
let mut url = CanonicalUrl::new(url).0;
|
||||
|
||||
// If a Git URL ends in a reference (like a branch, tag, or commit), remove it.
|
||||
if url.scheme().starts_with("git+") {
|
||||
if let Some((prefix, _)) = url.as_str().rsplit_once('@') {
|
||||
url = prefix.parse().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
// Drop any fragments and query parameters.
|
||||
url.set_fragment(None);
|
||||
url.set_query(None);
|
||||
|
||||
RepositoryUrl(url)
|
||||
}
|
||||
|
||||
pub fn parse(url: &str) -> Result<Self, url::ParseError> {
|
||||
Ok(Self::new(&Url::parse(url)?))
|
||||
}
|
||||
}
|
||||
|
||||
impl CacheKey for RepositoryUrl {
|
||||
fn cache_key(&self, state: &mut CacheKeyHasher) {
|
||||
// `as_str` gives the serialisation of a url (which has a spec) and so insulates against
|
||||
// possible changes in how the URL crate does hashing.
|
||||
self.0.as_str().cache_key(state);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// `CanonicalUrl` ignores the `.git` suffix but distinguishes repositories, subdirectories,
    /// and Git references.
    #[test]
    fn canonical_url() -> Result<(), url::ParseError> {
        // Two URLs should be considered equal regardless of the `.git` suffix.
        assert_eq!(
            CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git")?,
            CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages")?,
        );

        // Two URLs should _not_ be considered equal if they point to different repositories.
        assert_ne!(
            CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git")?,
            CanonicalUrl::parse("git+https://github.com/pypa/sample-packages.git")?,
        );

        // Two URLs should _not_ be considered equal if they request different subdirectories.
        assert_ne!(
            CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_a")?,
            CanonicalUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_b")?,
        );

        // Two URLs should _not_ be considered equal if they request different commit tags.
        assert_ne!(
            CanonicalUrl::parse(
                "git+https://github.com/pypa/sample-namespace-packages.git@v1.0.0"
            )?,
            CanonicalUrl::parse(
                "git+https://github.com/pypa/sample-namespace-packages.git@v2.0.0"
            )?,
        );

        Ok(())
    }

    /// `RepositoryUrl` ignores the `.git` suffix, subdirectories, and Git references, but still
    /// distinguishes repositories.
    #[test]
    fn repository_url() -> Result<(), url::ParseError> {
        // Two URLs should be considered equal regardless of the `.git` suffix.
        assert_eq!(
            RepositoryUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git")?,
            RepositoryUrl::parse("git+https://github.com/pypa/sample-namespace-packages")?,
        );

        // Two URLs should _not_ be considered equal if they point to different repositories.
        assert_ne!(
            RepositoryUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git")?,
            RepositoryUrl::parse("git+https://github.com/pypa/sample-packages.git")?,
        );

        // Two URLs should be considered equal if they map to the same repository, even if they
        // request different subdirectories.
        assert_eq!(
            RepositoryUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_a")?,
            RepositoryUrl::parse("git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_b")?,
        );

        // Two URLs should be considered equal if they map to the same repository, even if they
        // request different commit tags.
        assert_eq!(
            RepositoryUrl::parse(
                "git+https://github.com/pypa/sample-namespace-packages.git@v1.0.0"
            )?,
            RepositoryUrl::parse(
                "git+https://github.com/pypa/sample-namespace-packages.git@v2.0.0"
            )?,
        );

        Ok(())
    }
}
|
||||
|
|
|
@ -2,7 +2,7 @@ use std::hash::Hasher;
|
|||
|
||||
use seahash::SeaHasher;
|
||||
|
||||
pub use canonical_url::CanonicalUrl;
|
||||
pub use canonical_url::{CanonicalUrl, RepositoryUrl};
|
||||
pub use digest::digest;
|
||||
|
||||
mod cache_key;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue