Use scheme parsing to determine absolute vs. relative URLs (#2904)

## Summary

We have a heuristic in `File` that attempts to detect whether a URL is
absolute or relative. However, `contains("://")` is prone to false
positives. In the linked issue, the URLs look like:

```
/packages/5a/d8/4d75d1e4287ad9d051aab793c68f902c9c55c4397636b5ee540ebd15aedf/pytz-2005k.tar.bz2?hash=597b596dc1c2c130cd0a57a043459c3bd6477c640c07ac34ca3ce8eed7e6f30c&remote=4d75d1e428/pytz-2005k.tar.bz2 (sha256)=597b596dc1c2c130cd0a57a043459c3bd6477c640c07ac34ca3ce8eed7e6f30c
```

This URL is relative, but it still includes `://`.

Instead, we should determine whether the URL has a _scheme_, which
matches what the `url` crate does internally.

Closes https://github.com/astral-sh/uv/issues/2899.
This commit is contained in:
Charlie Marsh 2024-04-08 17:04:27 -04:00 committed by GitHub
parent bdeab55193
commit cc3c5700e1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 19 additions and 34 deletions

View file

@ -6,6 +6,7 @@ use thiserror::Error;
use url::Url;
use pep440_rs::{VersionSpecifiers, VersionSpecifiersParseError};
use pep508_rs::split_scheme;
use pypi_types::{DistInfoMetadata, Hashes, Yanked};
/// Error converting [`pypi_types::File`] to [`distribution_type::File`].
@ -51,10 +52,12 @@ impl File {
.map_err(|err| FileConversionError::RequiresPython(err.line().clone(), err))?,
size: file.size,
upload_time_utc_ms: file.upload_time.map(|dt| dt.timestamp_millis()),
url: if file.url.contains("://") {
FileLocation::AbsoluteUrl(file.url)
} else {
FileLocation::RelativeUrl(base.to_string(), file.url)
url: {
if split_scheme(&file.url).is_some() {
FileLocation::AbsoluteUrl(file.url)
} else {
FileLocation::RelativeUrl(base.to_string(), file.url)
}
},
yanked: file.yanked,
})

View file

@ -1,37 +1,19 @@
use serde::{Deserialize, Serialize};
use url::Url;
/// Join a possibly relative URL to a base URL.
///
/// When `maybe_relative` is not relative, then it is parsed and returned with
/// `base` being ignored.
///
/// This is useful for parsing URLs that may be absolute or relative, with a
/// known base URL, and that doesn't require having already parsed a `BaseUrl`.
pub fn base_url_join_relative(base: &str, maybe_relative: &str) -> Result<Url, JoinRelativeError> {
match Url::parse(maybe_relative) {
Ok(absolute) => Ok(absolute),
Err(err) => {
if err == url::ParseError::RelativeUrlWithoutBase {
let base_url = Url::parse(base).map_err(|err| JoinRelativeError::ParseError {
original: base.to_string(),
source: err,
})?;
/// Join a relative URL to a base URL.
pub fn base_url_join_relative(base: &str, relative: &str) -> Result<Url, JoinRelativeError> {
let base_url = Url::parse(base).map_err(|err| JoinRelativeError::ParseError {
original: base.to_string(),
source: err,
})?;
base_url
.join(maybe_relative)
.map_err(|_| JoinRelativeError::ParseError {
original: format!("{base}/{maybe_relative}"),
source: err,
})
} else {
Err(JoinRelativeError::ParseError {
original: maybe_relative.to_string(),
source: err,
})
}
}
}
base_url
.join(relative)
.map_err(|err| JoinRelativeError::ParseError {
original: format!("{base}/{relative}"),
source: err,
})
}
/// An error that occurs when `base_url_join_relative` fails.