Mirror of https://github.com/astral-sh/uv.git, synced 2025-07-07 21:35:00 +00:00
Avoid storing absolute URLs for files (#944)
## Summary

It turns out that storing an absolute URL for every file caused a significant performance regression. This PR attempts to address the regression with two changes.

The first: if a file's URL is absolute, we now store it as a raw string; if it's relative, we store the base URL alongside the raw relative string. As such, we avoid serializing and deserializing full URLs until we actually need them (later on), except for the base URL.

The second: we now use the internal `Url` crate methods for serialization and deserialization. `Url`'s standard serde implementations round-trip through a string: serialization writes the URL as a string, and deserialization re-parses it. But the crate exposes other methods for faster serialization and deserialization (with fewer guarantees). I think this is totally fine, since the cache is entirely internal.

If we _just_ change the `Url` serialization (and no other code, so we continue to store URLs for every file), the regression goes down to about 5%:

```shell
❯ python -m scripts.bench \
    --puffin-path ./target/release/main \
    --puffin-path ./target/release/relative \
    --puffin-path ./target/release/puffin \
    scripts/requirements/home-assistant.in --benchmark resolve-warm
Benchmark 1: ./target/release/main (resolve-warm)
  Time (mean ± σ):     496.3 ms ±   4.3 ms    [User: 452.4 ms, System: 175.5 ms]
  Range (min … max):   487.3 ms … 502.4 ms    10 runs

Benchmark 2: ./target/release/relative (resolve-warm)
  Time (mean ± σ):     284.8 ms ±   2.1 ms    [User: 245.8 ms, System: 165.6 ms]
  Range (min … max):   280.3 ms … 288.0 ms    10 runs

Benchmark 3: ./target/release/puffin (resolve-warm)
  Time (mean ± σ):     300.4 ms ±   3.2 ms    [User: 255.5 ms, System: 178.1 ms]
  Range (min … max):   295.4 ms … 305.1 ms    10 runs

Summary
  './target/release/relative (resolve-warm)' ran
    1.05 ± 0.01 times faster than './target/release/puffin (resolve-warm)'
    1.74 ± 0.02 times faster than './target/release/main (resolve-warm)'
```

So I considered _just_ making that change. But 5% is kind of borderline... With both of these changes, the regression is down to 1-2%:

```
Benchmark 1: ./target/release/relative (resolve-warm)
  Time (mean ± σ):     282.6 ms ±   7.4 ms    [User: 244.6 ms, System: 181.3 ms]
  Range (min … max):   275.1 ms … 318.5 ms    30 runs

Benchmark 2: ./target/release/puffin (resolve-warm)
  Time (mean ± σ):     286.8 ms ±   2.2 ms    [User: 247.0 ms, System: 169.1 ms]
  Range (min … max):   282.3 ms … 290.7 ms    30 runs

Summary
  './target/release/relative (resolve-warm)' ran
    1.01 ± 0.03 times faster than './target/release/puffin (resolve-warm)'
```

It's consistently ~2%-ish, but at this point it's unclear whether that's due to the URL change or some other change between now and then.

Closes #943.
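As a rough illustration of the first change, here is a minimal, self-contained sketch of the idea. It is not the actual crate layout: `BaseUrl` is simplified to a plain `Url` wrapper without the custom serde wiring shown in the diff, and the `to_url` helper is hypothetical (the real code resolves relative URLs via `BaseUrl::join_relative` at each call site, as the hunks below show).

```rust
use url::{ParseError, Url};

/// Simplified stand-in for the crate's `BaseUrl` wrapper.
#[derive(Debug, Clone)]
pub struct BaseUrl(Url);

/// Simplified `FileLocation`: URLs are kept as strings and only parsed on demand.
#[derive(Debug, Clone)]
pub enum FileLocation {
    /// A URL relative to the index's base URL, stored unparsed.
    RelativeUrl(BaseUrl, String),
    /// An absolute URL, stored unparsed.
    AbsoluteUrl(String),
}

impl FileLocation {
    /// Classify a file URL coming from the index: anything with a scheme is absolute.
    pub fn new(base: &BaseUrl, raw: String) -> Self {
        if raw.contains("://") {
            FileLocation::AbsoluteUrl(raw)
        } else {
            FileLocation::RelativeUrl(base.clone(), raw)
        }
    }

    /// Hypothetical helper: resolve to a full `Url` only at the point of use.
    pub fn to_url(&self) -> Result<Url, ParseError> {
        match self {
            FileLocation::RelativeUrl(base, relative) => base.0.join(relative),
            FileLocation::AbsoluteUrl(absolute) => Url::parse(absolute),
        }
    }
}

fn main() -> Result<(), ParseError> {
    let base = BaseUrl(Url::parse("https://pypi.org/simple/flask/")?);
    let relative = FileLocation::new(&base, "../../packages/flask-3.0.0.tar.gz".to_string());
    let absolute = FileLocation::new(
        &base,
        "https://files.pythonhosted.org/packages/flask-3.0.0.tar.gz".to_string(),
    );
    // Parsing happens here, at the point of use, not when the index response is cached.
    println!("{}", relative.to_url()?);
    println!("{}", absolute.to_url()?);
    Ok(())
}
```

The point of the design is that `Url` parsing moves from cache-deserialization time (once per file, for every file) to the handful of call sites that actually need a full URL.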
Parent: b8fbd529a1
Commit: a0420114c3
8 changed files with 94 additions and 18 deletions
```diff
@@ -8,6 +8,7 @@ use std::num::{
 use std::path::{Path, PathBuf};
 
 use seahash::SeaHasher;
+use url::Url;
 
 /// A trait for types that can be hashed in a stable way across versions and platforms. Equivalent
 /// to Ruff's [`CacheKey`] trait.
@@ -211,6 +212,13 @@ impl CacheKey for PathBuf {
     }
 }
 
+impl CacheKey for Url {
+    #[inline]
+    fn cache_key(&self, state: &mut CacheKeyHasher) {
+        self.as_str().cache_key(state);
+    }
+}
+
 impl<T: CacheKey> CacheKey for Option<T> {
     #[inline]
     fn cache_key(&self, state: &mut CacheKeyHasher) {
```
```diff
@@ -4,7 +4,6 @@ use std::path::PathBuf;
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
 use thiserror::Error;
-use url::Url;
 
 use pep440_rs::{VersionSpecifiers, VersionSpecifiersParseError};
 use pep508_rs::VerbatimUrl;
@@ -42,10 +41,11 @@ impl File {
             requires_python: file.requires_python.transpose()?,
             size: file.size,
             upload_time: file.upload_time,
-            url: FileLocation::Url(
-                base.join_relative(&file.url)
-                    .map_err(|err| FileConversionError::Url(file.url.clone(), err))?,
-            ),
+            url: if file.url.contains("://") {
+                FileLocation::AbsoluteUrl(file.url)
+            } else {
+                FileLocation::RelativeUrl(base.clone(), file.url)
+            },
             yanked: file.yanked,
         })
     }
@@ -55,7 +55,9 @@ impl File {
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub enum FileLocation {
     /// URL relative to the base URL.
-    Url(Url),
+    RelativeUrl(BaseUrl, String),
+    /// Absolute URL.
+    AbsoluteUrl(String),
     /// Absolute path to a file.
     Path(PathBuf, VerbatimUrl),
 }
@@ -63,7 +65,8 @@ pub enum FileLocation {
 impl Display for FileLocation {
     fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
         match self {
-            FileLocation::Url(url) => Display::fmt(url, f),
+            FileLocation::RelativeUrl(_base, url) => Display::fmt(&url, f),
+            FileLocation::AbsoluteUrl(url) => Display::fmt(&url, f),
            FileLocation::Path(path, _url) => Display::fmt(&path.display(), f),
         }
     }
```
```diff
@@ -694,17 +694,49 @@ impl Identifier for Path {
     }
 }
 
+impl Identifier for String {
+    fn distribution_id(&self) -> DistributionId {
+        DistributionId::new(cache_key::digest(&self))
+    }
+
+    fn resource_id(&self) -> ResourceId {
+        ResourceId::new(cache_key::digest(&self))
+    }
+}
+
+impl Identifier for &str {
+    fn distribution_id(&self) -> DistributionId {
+        DistributionId::new(cache_key::digest(&self))
+    }
+
+    fn resource_id(&self) -> ResourceId {
+        ResourceId::new(cache_key::digest(&self))
+    }
+}
+
+impl Identifier for (&Url, &str) {
+    fn distribution_id(&self) -> DistributionId {
+        DistributionId::new(cache_key::digest(&self))
+    }
+
+    fn resource_id(&self) -> ResourceId {
+        ResourceId::new(cache_key::digest(&self))
+    }
+}
+
 impl Identifier for FileLocation {
     fn distribution_id(&self) -> DistributionId {
         match self {
-            FileLocation::Url(url) => url.distribution_id(),
+            FileLocation::RelativeUrl(base, url) => (base.as_url(), url.as_str()).distribution_id(),
+            FileLocation::AbsoluteUrl(url) => url.distribution_id(),
             FileLocation::Path(path, _) => path.distribution_id(),
         }
     }
 
     fn resource_id(&self) -> ResourceId {
         match self {
-            FileLocation::Url(url) => url.resource_id(),
+            FileLocation::RelativeUrl(base, url) => (base.as_url(), url.as_str()).resource_id(),
+            FileLocation::AbsoluteUrl(url) => url.resource_id(),
             FileLocation::Path(path, _) => path.resource_id(),
         }
     }
```
```diff
@@ -13,6 +13,10 @@ use url::Url;
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 pub struct VerbatimUrl {
     /// The parsed URL.
+    #[serde(
+        serialize_with = "Url::serialize_internal",
+        deserialize_with = "Url::deserialize_internal"
+    )]
     url: Url,
     /// The URL as it was provided by the user.
     #[derivative(PartialEq = "ignore")]
```
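For context on the `serialize_with`/`deserialize_with` wiring used above (and again for `BaseUrl` further down): these serde field attributes swap in custom functions for a single field of a derived struct. `Url::serialize_internal` and `Url::deserialize_internal` are undocumented internals of the `url` crate, so the sketch below only illustrates the attribute mechanism with hand-written stand-ins on a hypothetical `CachedFile` type; it is not the faster internal representation the PR relies on.

```rust
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use url::Url;

// Hand-written stand-ins: these just round-trip through a string, but the same
// hooks let a crate plug in any serialization strategy for this one field.
fn serialize_url<S: Serializer>(url: &Url, serializer: S) -> Result<S::Ok, S::Error> {
    serializer.serialize_str(url.as_str())
}

fn deserialize_url<'de, D: Deserializer<'de>>(deserializer: D) -> Result<Url, D::Error> {
    let raw = String::deserialize(deserializer)?;
    Url::parse(&raw).map_err(serde::de::Error::custom)
}

/// Hypothetical cached record whose `url` field bypasses `Url`'s default serde impls.
#[derive(Debug, Serialize, Deserialize)]
struct CachedFile {
    filename: String,
    #[serde(serialize_with = "serialize_url", deserialize_with = "deserialize_url")]
    url: Url,
}
```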
```diff
@@ -225,8 +225,14 @@ impl RegistryClient {
     pub async fn wheel_metadata(&self, built_dist: &BuiltDist) -> Result<Metadata21, Error> {
         let metadata = match &built_dist {
             BuiltDist::Registry(wheel) => match &wheel.file.url {
-                FileLocation::Url(url) => {
-                    self.wheel_metadata_registry(&wheel.index, &wheel.file, url)
+                FileLocation::RelativeUrl(base, url) => {
+                    let url = base.join_relative(url)?;
+                    self.wheel_metadata_registry(&wheel.index, &wheel.file, &url)
                         .await?
                 }
+                FileLocation::AbsoluteUrl(url) => {
+                    let url = Url::parse(url)?;
+                    self.wheel_metadata_registry(&wheel.index, &wheel.file, &url)
+                        .await?
+                }
                 FileLocation::Path(path, _url) => {
```
```diff
@@ -111,7 +111,11 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
         match &dist {
             Dist::Built(BuiltDist::Registry(wheel)) => {
                 let url = match &wheel.file.url {
-                    FileLocation::Url(url) => url,
+                    FileLocation::RelativeUrl(base, url) => base
+                        .join_relative(url)
+                        .map_err(|err| DistributionDatabaseError::Url(url.clone(), err))?,
+                    FileLocation::AbsoluteUrl(url) => Url::parse(url)
+                        .map_err(|err| DistributionDatabaseError::Url(url.clone(), err))?,
                     FileLocation::Path(path, url) => {
                         let cache_entry = self.cache.entry(
                             CacheBucket::Wheels,
@@ -143,7 +147,7 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
                 // for downloading and unzipping (with a buffer in between) and switch
                 // to rayon if this buffer grows large by the time the file is fully
                 // downloaded.
-                let reader = self.client.stream_external(url).await?;
+                let reader = self.client.stream_external(&url).await?;
 
                 // Download and unzip the wheel to a temporary directory.
                 let temp_dir = tempfile::tempdir_in(self.cache.root())?;
```
```diff
@@ -99,7 +99,11 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
             }
             SourceDist::Registry(registry_source_dist) => {
                 let url = match &registry_source_dist.file.url {
-                    FileLocation::Url(url) => url,
+                    FileLocation::RelativeUrl(base, url) => base
+                        .join_relative(url)
+                        .map_err(|err| SourceDistError::UrlParse(url.clone(), err))?,
+                    FileLocation::AbsoluteUrl(url) => Url::parse(url)
+                        .map_err(|err| SourceDistError::UrlParse(url.clone(), err))?,
                     FileLocation::Path(path, url) => {
                         let path_source_dist = PathSourceDist {
                             name: registry_source_dist.filename.name.clone(),
@@ -123,7 +127,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
                 self.url(
                     source_dist,
                     &registry_source_dist.file.filename,
-                    url,
+                    &url,
                     &cache_shard,
                     None,
                 )
@@ -168,7 +172,11 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
             }
             SourceDist::Registry(registry_source_dist) => {
                 let url = match &registry_source_dist.file.url {
-                    FileLocation::Url(url) => url,
+                    FileLocation::RelativeUrl(base, url) => base
+                        .join_relative(url)
+                        .map_err(|err| SourceDistError::UrlParse(url.clone(), err))?,
+                    FileLocation::AbsoluteUrl(url) => Url::parse(url)
+                        .map_err(|err| SourceDistError::UrlParse(url.clone(), err))?,
                     FileLocation::Path(path, url) => {
                         let path_source_dist = PathSourceDist {
                             name: registry_source_dist.filename.name.clone(),
@@ -192,7 +200,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
                 self.url_metadata(
                     source_dist,
                     &registry_source_dist.file.filename,
-                    url,
+                    &url,
                     &cache_shard,
                     None,
                 )
```
```diff
@@ -2,7 +2,13 @@ use serde::{Deserialize, Serialize};
 use url::Url;
 
 #[derive(Debug, Clone, Hash, Eq, PartialEq, Serialize, Deserialize)]
-pub struct BaseUrl(Url);
+pub struct BaseUrl(
+    #[serde(
+        serialize_with = "Url::serialize_internal",
+        deserialize_with = "Url::deserialize_internal"
+    )]
+    Url,
+);
 
 impl BaseUrl {
     /// Parse the given URL. If it's relative, join it to the current [`BaseUrl`]. Allows for
@@ -19,6 +25,11 @@ impl BaseUrl {
             }
         }
     }
+
+    /// Return the underlying [`Url`].
+    pub fn as_url(&self) -> &Url {
+        &self.0
+    }
 }
 
 impl From<Url> for BaseUrl {
```