mirror of
https://github.com/astral-sh/uv.git
synced 2025-08-04 19:08:04 +00:00

## Summary It turns out that storing an absolute URL for every file caused a significant performance regression. This PR attempts to address the regression with two changes. The first is that we now store the raw string if the URL is an absolute URL. If the URL is relative, we store the base URL alongside the raw relative string. As such, we avoid serializing and deserializing URLs until we need them (later on), except for the base URL. The second is that we now use the internal `Url` crate methods for serializing and deserializing. If you look inside `Url`, its standard serialization and deserialization actually convert it to a string, then parse the string. But the crate exposes some other methods for faster serialization and deserialization (with fewer guarantees). I think this is totally fine since the cache is entirely internal. If we _just_ change the `Url` serialization (and no other code -- so continue to store URLs for every file), then the regression goes down to about 5%: ```shell ❯ python -m scripts.bench \ --puffin-path ./target/release/main \ --puffin-path ./target/release/relative --puffin-path ./target/release/puffin \ scripts/requirements/home-assistant.in --benchmark resolve-warm Benchmark 1: ./target/release/main (resolve-warm) Time (mean ± σ): 496.3 ms ± 4.3 ms [User: 452.4 ms, System: 175.5 ms] Range (min … max): 487.3 ms … 502.4 ms 10 runs Benchmark 2: ./target/release/relative (resolve-warm) Time (mean ± σ): 284.8 ms ± 2.1 ms [User: 245.8 ms, System: 165.6 ms] Range (min … max): 280.3 ms … 288.0 ms 10 runs Benchmark 3: ./target/release/puffin (resolve-warm) Time (mean ± σ): 300.4 ms ± 3.2 ms [User: 255.5 ms, System: 178.1 ms] Range (min … max): 295.4 ms … 305.1 ms 10 runs Summary './target/release/relative (resolve-warm)' ran 1.05 ± 0.01 times faster than './target/release/puffin (resolve-warm)' 1.74 ± 0.02 times faster than './target/release/main (resolve-warm)' ``` So I considered _just_ making that change. But 5% is kind of borderline...
With both of these changes, the regression is down to 1-2%: ``` Benchmark 1: ./target/release/relative (resolve-warm) Time (mean ± σ): 282.6 ms ± 7.4 ms [User: 244.6 ms, System: 181.3 ms] Range (min … max): 275.1 ms … 318.5 ms 30 runs Benchmark 2: ./target/release/puffin (resolve-warm) Time (mean ± σ): 286.8 ms ± 2.2 ms [User: 247.0 ms, System: 169.1 ms] Range (min … max): 282.3 ms … 290.7 ms 30 runs Summary './target/release/relative (resolve-warm)' ran 1.01 ± 0.03 times faster than './target/release/puffin (resolve-warm)' ``` It's consistently ~2%-ish, but at this point it's unclear if that's due to the URL change or some other change between now and then. Closes #943.
131 lines
3.8 KiB
Rust
131 lines
3.8 KiB
Rust
use std::borrow::Cow;
|
|
use std::ops::Deref;
|
|
use std::path::Path;
|
|
|
|
use once_cell::sync::Lazy;
|
|
use regex::Regex;
|
|
use serde::{Deserialize, Serialize};
|
|
use url::Url;
|
|
|
|
/// A wrapper around [`Url`] that preserves the original string.
|
|
#[derive(Debug, Clone, Eq, derivative::Derivative)]
|
|
#[derivative(PartialEq, Hash)]
|
|
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
|
pub struct VerbatimUrl {
|
|
/// The parsed URL.
|
|
#[serde(
|
|
serialize_with = "Url::serialize_internal",
|
|
deserialize_with = "Url::deserialize_internal"
|
|
)]
|
|
url: Url,
|
|
/// The URL as it was provided by the user.
|
|
#[derivative(PartialEq = "ignore")]
|
|
#[derivative(Hash = "ignore")]
|
|
given: Option<String>,
|
|
}
|
|
|
|
impl VerbatimUrl {
|
|
/// Parse a URL from a string, expanding any environment variables.
|
|
pub fn parse(given: String) -> Result<Self, Error> {
|
|
let url = Url::parse(&expand_env_vars(&given))?;
|
|
Ok(Self {
|
|
given: Some(given),
|
|
url,
|
|
})
|
|
}
|
|
|
|
/// Parse a URL from a path.
|
|
#[allow(clippy::result_unit_err)]
|
|
pub fn from_path(path: impl AsRef<Path>, given: String) -> Result<Self, ()> {
|
|
Ok(Self {
|
|
url: Url::from_directory_path(path)?,
|
|
given: Some(given),
|
|
})
|
|
}
|
|
|
|
/// Return the original string as given by the user, if available.
|
|
pub fn given(&self) -> Option<&str> {
|
|
self.given.as_deref()
|
|
}
|
|
|
|
/// Return the underlying [`Url`].
|
|
pub fn raw(&self) -> &Url {
|
|
&self.url
|
|
}
|
|
|
|
/// Convert a [`VerbatimUrl`] into a [`Url`].
|
|
pub fn to_url(&self) -> Url {
|
|
self.url.clone()
|
|
}
|
|
|
|
/// Create a [`VerbatimUrl`] from a [`Url`].
|
|
///
|
|
/// This method should be used sparingly (ideally, not at all), as it represents a loss of the
|
|
/// verbatim representation.
|
|
pub fn unknown(url: Url) -> Self {
|
|
Self { given: None, url }
|
|
}
|
|
}
|
|
|
|
impl std::str::FromStr for VerbatimUrl {
|
|
type Err = Error;
|
|
|
|
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
|
Self::parse(s.to_owned())
|
|
}
|
|
}
|
|
|
|
impl std::fmt::Display for VerbatimUrl {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
self.url.fmt(f)
|
|
}
|
|
}
|
|
|
|
impl Deref for VerbatimUrl {
|
|
type Target = Url;
|
|
|
|
fn deref(&self) -> &Self::Target {
|
|
&self.url
|
|
}
|
|
}
|
|
|
|
#[derive(thiserror::Error, Debug, Clone, PartialEq, Eq)]
|
|
pub enum Error {
|
|
#[error(transparent)]
|
|
Url(#[from] url::ParseError),
|
|
}
|
|
|
|
/// Expand all available environment variables.
|
|
///
|
|
/// This is modeled off of pip's environment variable expansion, which states:
|
|
///
|
|
/// The only allowed format for environment variables defined in the
|
|
/// requirement file is `${MY_VARIABLE_1}` to ensure two things:
|
|
///
|
|
/// 1. Strings that contain a `$` aren't accidentally (partially) expanded.
|
|
/// 2. Ensure consistency across platforms for requirement files.
|
|
///
|
|
/// ...
|
|
///
|
|
/// Valid characters in variable names follow the `POSIX standard
|
|
/// <http://pubs.opengroup.org/onlinepubs/9699919799/>`_ and are limited
|
|
/// to uppercase letter, digits and the `_` (underscore).
|
|
fn expand_env_vars(s: &str) -> Cow<'_, str> {
|
|
// Generate a URL-escaped project root, to be used via the `${PROJECT_ROOT}`
|
|
// environment variable. Ensure that it's URL-escaped.
|
|
static PROJECT_ROOT_FRAGMENT: Lazy<String> = Lazy::new(|| {
|
|
let project_root = std::env::current_dir().unwrap();
|
|
project_root.to_string_lossy().replace(' ', "%20")
|
|
});
|
|
|
|
static RE: Lazy<Regex> =
|
|
Lazy::new(|| Regex::new(r"(?P<var>\$\{(?P<name>[A-Z0-9_]+)})").unwrap());
|
|
|
|
RE.replace_all(s, |caps: ®ex::Captures<'_>| {
|
|
let name = caps.name("name").unwrap().as_str();
|
|
std::env::var(name).unwrap_or_else(|_| match name {
|
|
"PROJECT_ROOT" => PROJECT_ROOT_FRAGMENT.clone(),
|
|
_ => caps["var"].to_owned(),
|
|
})
|
|
})
|
|
}
|