Avoid reparsing wheel URLs (#4947)

## Summary

We currently store wheel URLs in an unparsed state because we don't have
a stable parsed representation to use with rykv. Unfortunately this
means we end up reparsing unnecessarily in a lot of places, especially
when constructing a `Lock`. This PR adds a `UrlString` type that lets us
avoid reparsing without losing the validity of the `Url`.

## Test Plan

Shaves off another ~10 ms from
https://github.com/astral-sh/uv/issues/4860.

```
➜  transformers hyperfine "../../uv/target/profiling/uv lock" "../../uv/target/profiling/baseline lock" --warmup 3
Benchmark 1: ../../uv/target/profiling/uv lock
  Time (mean ± σ):     120.9 ms ±   2.5 ms    [User: 126.0 ms, System: 80.6 ms]
  Range (min … max):   116.8 ms … 125.7 ms    23 runs
 
Benchmark 2: ../../uv/target/profiling/baseline lock
  Time (mean ± σ):     129.9 ms ±   4.2 ms    [User: 127.1 ms, System: 86.1 ms]
  Range (min … max):   123.4 ms … 141.2 ms    23 runs

Summary
  ../../uv/target/profiling/uv lock ran
    1.07 ± 0.04 times faster than ../../uv/target/profiling/baseline lock
```
This commit is contained in:
Ibraheem Ahmed 2024-07-10 05:16:30 -04:00 committed by GitHub
parent 23eb42deed
commit d833910a5d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 84 additions and 44 deletions

View file

@ -1,11 +1,11 @@
use std::fmt::{Display, Formatter}; use std::fmt::{self, Display, Formatter};
use std::path::PathBuf; use std::path::PathBuf;
use std::str::FromStr;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use url::Url; use url::Url;
use pep440_rs::{VersionSpecifiers, VersionSpecifiersParseError}; use pep440_rs::{VersionSpecifiers, VersionSpecifiersParseError};
use pep508_rs::split_scheme;
use pypi_types::{CoreMetadata, HashDigest, Yanked}; use pypi_types::{CoreMetadata, HashDigest, Yanked};
/// Error converting [`pypi_types::File`] to [`distribution_type::File`]. /// Error converting [`pypi_types::File`] to [`distribution_type::File`].
@ -56,10 +56,9 @@ impl File {
.map_err(|err| FileConversionError::RequiresPython(err.line().clone(), err))?, .map_err(|err| FileConversionError::RequiresPython(err.line().clone(), err))?,
size: file.size, size: file.size,
upload_time_utc_ms: file.upload_time.map(|dt| dt.timestamp_millis()), upload_time_utc_ms: file.upload_time.map(|dt| dt.timestamp_millis()),
url: if split_scheme(&file.url).is_some() { url: match Url::parse(&file.url) {
FileLocation::AbsoluteUrl(file.url) Ok(url) => FileLocation::AbsoluteUrl(url.into()),
} else { Err(_) => FileLocation::RelativeUrl(base.to_string(), file.url),
FileLocation::RelativeUrl(base.to_string(), file.url)
}, },
yanked: file.yanked, yanked: file.yanked,
}) })
@ -76,7 +75,7 @@ pub enum FileLocation {
/// URL relative to the base URL. /// URL relative to the base URL.
RelativeUrl(String, String), RelativeUrl(String, String),
/// Absolute URL. /// Absolute URL.
AbsoluteUrl(String), AbsoluteUrl(UrlString),
/// Absolute path to a file. /// Absolute path to a file.
Path(#[with(rkyv::with::AsString)] PathBuf), Path(#[with(rkyv::with::AsString)] PathBuf),
} }
@ -107,13 +106,7 @@ impl FileLocation {
})?; })?;
Ok(joined) Ok(joined)
} }
FileLocation::AbsoluteUrl(ref absolute) => { FileLocation::AbsoluteUrl(ref absolute) => Ok(absolute.to_url()),
let url = Url::parse(absolute).map_err(|err| ToUrlError::InvalidAbsolute {
absolute: absolute.clone(),
err,
})?;
Ok(url)
}
FileLocation::Path(ref path) => { FileLocation::Path(ref path) => {
let path = path let path = path
.to_str() .to_str()
@ -125,18 +118,79 @@ impl FileLocation {
} }
} }
} }
/// Convert this location to a URL.
///
/// This method is identical to [`FileLocation::to_url`] except it avoids parsing absolute URLs
/// as they are already guaranteed to be valid.
pub fn to_url_string(&self) -> Result<UrlString, ToUrlError> {
match *self {
FileLocation::AbsoluteUrl(ref absolute) => Ok(absolute.clone()),
_ => Ok(self.to_url()?.into()),
}
}
} }
impl Display for FileLocation { impl Display for FileLocation {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self { match self {
Self::RelativeUrl(_base, url) => Display::fmt(&url, f), Self::RelativeUrl(_base, url) => Display::fmt(&url, f),
Self::AbsoluteUrl(url) => Display::fmt(&url, f), Self::AbsoluteUrl(url) => Display::fmt(&url.0, f),
Self::Path(path) => Display::fmt(&path.display(), f), Self::Path(path) => Display::fmt(&path.display(), f),
} }
} }
} }
/// A [`Url`] represented as a `String`.
///
/// This type is guaranteed to be a valid URL but avoids being parsed into the [`Url`] type.
#[derive(
Debug,
Clone,
Serialize,
Deserialize,
rkyv::Archive,
rkyv::Deserialize,
rkyv::Serialize,
PartialEq,
Eq,
)]
#[archive(check_bytes)]
#[archive_attr(derive(Debug))]
pub struct UrlString(String);
impl UrlString {
/// Converts a [`UrlString`] to a [`Url`].
pub fn to_url(&self) -> Url {
// This conversion can never fail as the only way to construct a `UrlString` is from a `Url`.
Url::from_str(&self.0).unwrap()
}
}
impl AsRef<str> for UrlString {
fn as_ref(&self) -> &str {
&self.0
}
}
impl From<Url> for UrlString {
fn from(value: Url) -> Self {
UrlString(value.to_string())
}
}
impl From<UrlString> for String {
fn from(value: UrlString) -> Self {
value.0
}
}
impl fmt::Display for UrlString {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
fmt::Display::fmt(&self.0, f)
}
}
/// An error that occurs when a `FileLocation` is not a valid URL. /// An error that occurs when a `FileLocation` is not a valid URL.
#[derive(Clone, Debug, Eq, PartialEq, thiserror::Error)] #[derive(Clone, Debug, Eq, PartialEq, thiserror::Error)]
pub enum ToUrlError { pub enum ToUrlError {

View file

@ -421,7 +421,7 @@ impl RegistryClient {
} }
} }
FileLocation::AbsoluteUrl(url) => { FileLocation::AbsoluteUrl(url) => {
let url = Url::parse(url).map_err(ErrorKind::UrlParse)?; let url = url.to_url();
if url.scheme() == "file" { if url.scheme() == "file" {
let path = url let path = url
.to_file_path() .to_file_path()

View file

@ -160,9 +160,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
FileLocation::RelativeUrl(base, url) => { FileLocation::RelativeUrl(base, url) => {
pypi_types::base_url_join_relative(base, url)? pypi_types::base_url_join_relative(base, url)?
} }
FileLocation::AbsoluteUrl(url) => { FileLocation::AbsoluteUrl(url) => url.to_url(),
Url::parse(url).map_err(|err| Error::Url(url.clone(), err))?
}
FileLocation::Path(path) => { FileLocation::Path(path) => {
let cache_entry = self.build_context.cache().entry( let cache_entry = self.build_context.cache().entry(
CacheBucket::Wheels, CacheBucket::Wheels,

View file

@ -102,9 +102,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
FileLocation::RelativeUrl(base, url) => { FileLocation::RelativeUrl(base, url) => {
pypi_types::base_url_join_relative(base, url)? pypi_types::base_url_join_relative(base, url)?
} }
FileLocation::AbsoluteUrl(url) => { FileLocation::AbsoluteUrl(url) => url.to_url(),
Url::parse(url).map_err(|err| Error::Url(url.clone(), err))?
}
FileLocation::Path(path) => { FileLocation::Path(path) => {
let url = Url::from_file_path(path) let url = Url::from_file_path(path)
.map_err(|()| Error::RelativePath(path.clone()))?; .map_err(|()| Error::RelativePath(path.clone()))?;
@ -280,9 +278,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
FileLocation::RelativeUrl(base, url) => { FileLocation::RelativeUrl(base, url) => {
pypi_types::base_url_join_relative(base, url)? pypi_types::base_url_join_relative(base, url)?
} }
FileLocation::AbsoluteUrl(url) => { FileLocation::AbsoluteUrl(url) => url.to_url(),
Url::parse(url).map_err(|err| Error::Url(url.clone(), err))?
}
FileLocation::Path(path) => { FileLocation::Path(path) => {
let url = Url::from_file_path(path) let url = Url::from_file_path(path)
.map_err(|()| Error::RelativePath(path.clone()))?; .map_err(|()| Error::RelativePath(path.clone()))?;

View file

@ -17,7 +17,7 @@ use distribution_filename::WheelFilename;
use distribution_types::{ use distribution_types::{
BuiltDist, DirectUrlBuiltDist, DirectUrlSourceDist, DirectorySourceDist, Dist, FileLocation, BuiltDist, DirectUrlBuiltDist, DirectUrlSourceDist, DirectorySourceDist, Dist, FileLocation,
GitSourceDist, IndexUrl, PathBuiltDist, PathSourceDist, RegistryBuiltDist, RegistryBuiltWheel, GitSourceDist, IndexUrl, PathBuiltDist, PathSourceDist, RegistryBuiltDist, RegistryBuiltWheel,
RegistrySourceDist, RemoteSource, Resolution, ResolvedDist, ToUrlError, RegistrySourceDist, RemoteSource, Resolution, ResolvedDist, ToUrlError, UrlString,
}; };
use pep440_rs::{Version, VersionSpecifiers}; use pep440_rs::{Version, VersionSpecifiers};
use pep508_rs::{ use pep508_rs::{
@ -716,7 +716,7 @@ impl Distribution {
requires_python: None, requires_python: None,
size: sdist.size(), size: sdist.size(),
upload_time_utc_ms: None, upload_time_utc_ms: None,
url: FileLocation::AbsoluteUrl(file_url.to_string()), url: FileLocation::AbsoluteUrl(file_url.clone().into()),
yanked: None, yanked: None,
}); });
let index = IndexUrl::Url(VerbatimUrl::from_url(url.clone())); let index = IndexUrl::Url(VerbatimUrl::from_url(url.clone()));
@ -1705,7 +1705,7 @@ struct Wheel {
/// against was found. The location does not need to exist in the future, /// against was found. The location does not need to exist in the future,
/// so this should be treated as only a hint to where to look and/or /// so this should be treated as only a hint to where to look and/or
/// recording where the wheel file originally came from. /// recording where the wheel file originally came from.
url: Url, url: UrlString,
/// A hash of the built distribution. /// A hash of the built distribution.
/// ///
/// This is only present for wheels that come from registries and direct /// This is only present for wheels that come from registries and direct
@ -1773,7 +1773,7 @@ impl Wheel {
let url = wheel let url = wheel
.file .file
.url .url
.to_url() .to_url_string()
.map_err(LockErrorKind::InvalidFileUrl) .map_err(LockErrorKind::InvalidFileUrl)
.map_err(LockError::from)?; .map_err(LockError::from)?;
let hash = wheel.file.hashes.first().cloned().map(Hash::from); let hash = wheel.file.hashes.first().cloned().map(Hash::from);
@ -1788,7 +1788,7 @@ impl Wheel {
fn from_direct_dist(direct_dist: &DirectUrlBuiltDist, hashes: &[HashDigest]) -> Wheel { fn from_direct_dist(direct_dist: &DirectUrlBuiltDist, hashes: &[HashDigest]) -> Wheel {
Wheel { Wheel {
url: direct_dist.url.to_url(), url: direct_dist.url.to_url().into(),
hash: hashes.first().cloned().map(Hash::from), hash: hashes.first().cloned().map(Hash::from),
size: None, size: None,
filename: direct_dist.filename.clone(), filename: direct_dist.filename.clone(),
@ -1797,7 +1797,7 @@ impl Wheel {
fn from_path_dist(path_dist: &PathBuiltDist, hashes: &[HashDigest]) -> Wheel { fn from_path_dist(path_dist: &PathBuiltDist, hashes: &[HashDigest]) -> Wheel {
Wheel { Wheel {
url: path_dist.url.to_url(), url: path_dist.url.to_url().into(),
hash: hashes.first().cloned().map(Hash::from), hash: hashes.first().cloned().map(Hash::from),
size: None, size: None,
filename: path_dist.filename.clone(), filename: path_dist.filename.clone(),
@ -1813,7 +1813,7 @@ impl Wheel {
requires_python: None, requires_python: None,
size: self.size, size: self.size,
upload_time_utc_ms: None, upload_time_utc_ms: None,
url: FileLocation::AbsoluteUrl(self.url.to_string()), url: FileLocation::AbsoluteUrl(self.url.clone()),
yanked: None, yanked: None,
}); });
let index = IndexUrl::Url(VerbatimUrl::from_url(url.clone())); let index = IndexUrl::Url(VerbatimUrl::from_url(url.clone()));
@ -1872,7 +1872,7 @@ impl TryFrom<WheelWire> for Wheel {
.map_err(|err| format!("failed to parse `{filename}` as wheel filename: {err}"))?; .map_err(|err| format!("failed to parse `{filename}` as wheel filename: {err}"))?;
Ok(Wheel { Ok(Wheel {
url: wire.url, url: wire.url.into(),
hash: wire.hash, hash: wire.hash,
size: wire.size, size: wire.size,
filename, filename,

View file

@ -19,17 +19,9 @@ Ok(
sdist: None, sdist: None,
wheels: [ wheels: [
Wheel { Wheel {
url: Url { url: UrlString(
scheme: "file", "file:///foo/bar/anyio-4.3.0-py3-none-any.whl",
cannot_be_a_base: false, ),
username: "",
password: None,
host: None,
port: None,
path: "/foo/bar/anyio-4.3.0-py3-none-any.whl",
query: None,
fragment: None,
},
hash: Some( hash: Some(
Hash( Hash(
HashDigest { HashDigest {