Use arcstr for package, extra, and group names (#10475)

## Summary

This appears to be a consistent 1% performance improvement and should
also reduce memory usage quite a bit. We've also decided to use `arcstr`
for markers, so it's nice to use the same optimization here.

```
❯ hyperfine "./uv pip compile --universal scripts/requirements/airflow.in" "./arcstr pip compile --universal scripts/requirements/airflow.in" --min-runs 50 --warmup 20
Benchmark 1: ./uv pip compile --universal scripts/requirements/airflow.in
  Time (mean ± σ):     136.3 ms ±   4.0 ms    [User: 139.1 ms, System: 241.9 ms]
  Range (min … max):   131.5 ms … 149.5 ms    50 runs

Benchmark 2: ./arcstr pip compile --universal scripts/requirements/airflow.in
  Time (mean ± σ):     134.9 ms ±   3.2 ms    [User: 137.6 ms, System: 239.0 ms]
  Range (min … max):   130.1 ms … 151.8 ms    50 runs

Summary
  ./arcstr pip compile --universal scripts/requirements/airflow.in ran
    1.01 ± 0.04 times faster than ./uv pip compile --universal scripts/requirements/airflow.in
```
This commit is contained in:
Charlie Marsh 2025-01-10 14:46:36 -05:00 committed by GitHub
parent 503f9a97af
commit b3d7beb1a0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 166 additions and 16 deletions

View file

@ -4,6 +4,7 @@ use std::str::FromStr;
use serde::{Deserialize, Deserializer, Serialize};
use crate::small_string::SmallString;
use crate::{validate_and_normalize_owned, validate_and_normalize_ref, InvalidNameError};
/// The normalized name of an extra dependency.
@ -14,9 +15,9 @@ use crate::{validate_and_normalize_owned, validate_and_normalize_ref, InvalidNam
/// See:
/// - <https://peps.python.org/pep-0685/#specification/>
/// - <https://packaging.python.org/en/latest/specifications/name-normalization/>
#[derive(Debug, Default, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize)]
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize)]
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
pub struct ExtraName(String);
pub struct ExtraName(SmallString);
impl ExtraName {
/// Create a validated, normalized extra name.

View file

@ -5,6 +5,7 @@ use std::sync::LazyLock;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use crate::small_string::SmallString;
use crate::{validate_and_normalize_owned, validate_and_normalize_ref, InvalidNameError};
/// The normalized name of a dependency group.
@ -12,9 +13,9 @@ use crate::{validate_and_normalize_owned, validate_and_normalize_ref, InvalidNam
/// See:
/// - <https://peps.python.org/pep-0735/>
/// - <https://packaging.python.org/en/latest/specifications/name-normalization/>
#[derive(Debug, Default, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
pub struct GroupName(String);
pub struct GroupName(SmallString);
impl GroupName {
/// Create a validated, normalized group name.

View file

@ -5,26 +5,37 @@ pub use dist_info_name::DistInfoName;
pub use extra_name::ExtraName;
pub use group_name::{GroupName, DEV_DEPENDENCIES};
pub use package_name::PackageName;
use small_string::SmallString;
mod dist_info_name;
mod extra_name;
mod group_name;
mod package_name;
mod small_string;
/// Validate and normalize an owned package or extra name.
pub(crate) fn validate_and_normalize_owned(name: String) -> Result<String, InvalidNameError> {
pub(crate) fn validate_and_normalize_owned(name: String) -> Result<SmallString, InvalidNameError> {
if is_normalized(&name)? {
Ok(name)
Ok(SmallString::from(name))
} else {
validate_and_normalize_ref(name)
Ok(SmallString::from(normalize(&name)?))
}
}
/// Validate and normalize an unowned package or extra name.
pub(crate) fn validate_and_normalize_ref(
name: impl AsRef<str>,
) -> Result<String, InvalidNameError> {
) -> Result<SmallString, InvalidNameError> {
let name = name.as_ref();
if is_normalized(name)? {
Ok(SmallString::from(name))
} else {
Ok(SmallString::from(normalize(name)?))
}
}
/// Normalize an unowned package or extra name.
fn normalize(name: &str) -> Result<String, InvalidNameError> {
let mut normalized = String::with_capacity(name.len());
let mut last = None;
@ -136,9 +147,14 @@ mod tests {
"FrIeNdLy-._.-bArD",
];
for input in inputs {
assert_eq!(validate_and_normalize_ref(input).unwrap(), "friendly-bard");
assert_eq!(
validate_and_normalize_owned(input.to_string()).unwrap(),
validate_and_normalize_ref(input).unwrap().as_ref(),
"friendly-bard"
);
assert_eq!(
validate_and_normalize_owned(input.to_string())
.unwrap()
.as_ref(),
"friendly-bard"
);
}
@ -169,9 +185,11 @@ mod tests {
// Unchanged
let unchanged = ["friendly-bard", "1okay", "okay2"];
for input in unchanged {
assert_eq!(validate_and_normalize_ref(input).unwrap(), input);
assert_eq!(validate_and_normalize_ref(input).unwrap().as_ref(), input);
assert_eq!(
validate_and_normalize_owned(input.to_string()).unwrap(),
validate_and_normalize_owned(input.to_string())
.unwrap()
.as_ref(),
input
);
assert!(is_normalized(input).unwrap());

View file

@ -1,8 +1,10 @@
use std::borrow::Cow;
use std::cmp::PartialEq;
use std::str::FromStr;
use serde::{Deserialize, Deserializer, Serialize};
use crate::small_string::SmallString;
use crate::{validate_and_normalize_owned, validate_and_normalize_ref, InvalidNameError};
/// The normalized name of a package.
@ -13,7 +15,6 @@ use crate::{validate_and_normalize_owned, validate_and_normalize_ref, InvalidNam
/// See: <https://packaging.python.org/en/latest/specifications/name-normalization/>
#[derive(
Debug,
Default,
Clone,
PartialEq,
Eq,
@ -27,7 +28,7 @@ use crate::{validate_and_normalize_owned, validate_and_normalize_ref, InvalidNam
)]
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
#[rkyv(derive(Debug))]
pub struct PackageName(String);
pub struct PackageName(SmallString);
impl PackageName {
/// Create a validated, normalized package name.
@ -56,7 +57,7 @@ impl PackageName {
Cow::Owned(owned_string)
} else {
Cow::Borrowed(self.0.as_str())
Cow::Borrowed(self.0.as_ref())
}
}

View file

@ -0,0 +1,119 @@
use std::cmp::PartialEq;
use std::ops::Deref;
/// An optimized small string type for short identifiers, like package names.
///
/// Represented as an [`arcstr::ArcStr`] internally.
///
/// Values are built through the `From<&str>` and `From<String>` conversions and are
/// borrowed as `&str` through `Deref` or `AsRef<str>`. Equality, ordering, hashing,
/// cloning, and the default value are all derived, so they defer to the wrapped
/// [`arcstr::ArcStr`].
#[derive(Default, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub(crate) struct SmallString(arcstr::ArcStr);
impl From<&str> for SmallString {
#[inline]
fn from(s: &str) -> Self {
Self(s.into())
}
}
impl From<String> for SmallString {
#[inline]
fn from(s: String) -> Self {
Self(s.into())
}
}
impl AsRef<str> for SmallString {
    /// Borrows the contents as a plain string slice.
    #[inline]
    fn as_ref(&self) -> &str {
        self.0.as_str()
    }
}
impl Deref for SmallString {
    type Target = str;

    /// Dereferences to the wrapped string slice, making `str` methods available
    /// directly on a [`SmallString`].
    #[inline]
    fn deref(&self) -> &str {
        self.0.as_str()
    }
}
impl core::fmt::Debug for SmallString {
    /// Delegates to the `Debug` formatting of the string contents, so the wrapper
    /// type itself never shows up in debug output.
    #[inline]
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let contents = self.0.as_str();
        core::fmt::Debug::fmt(contents, f)
    }
}
impl core::fmt::Display for SmallString {
    /// Delegates to the `Display` formatting of the string contents.
    #[inline]
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let contents = self.0.as_str();
        core::fmt::Display::fmt(contents, f)
    }
}
/// A [`serde::Serialize`] implementation for [`SmallString`].
///
/// The value is emitted as a plain string.
impl serde::Serialize for SmallString {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        serializer.serialize_str(self.0.as_str())
    }
}
/// An [`rkyv`] implementation for [`SmallString`].
///
/// A [`SmallString`] is archived as an [`rkyv::string::ArchivedString`].
impl rkyv::Archive for SmallString {
    type Archived = rkyv::string::ArchivedString;
    type Resolver = rkyv::string::StringResolver;

    /// Resolves the archived form from the string contents.
    #[inline]
    fn resolve(&self, resolver: Self::Resolver, out: rkyv::Place<Self::Archived>) {
        let contents: &str = self.0.as_str();
        rkyv::string::ArchivedString::resolve_from_str(contents, resolver, out);
    }
}
impl<S> rkyv::Serialize<S> for SmallString
where
    S: rkyv::rancor::Fallible + rkyv::ser::Allocator + rkyv::ser::Writer + ?Sized,
    S::Error: rkyv::rancor::Source,
{
    /// Serializes the string contents through [`rkyv::string::ArchivedString`].
    fn serialize(&self, serializer: &mut S) -> Result<Self::Resolver, S::Error> {
        let contents: &str = self.0.as_str();
        rkyv::string::ArchivedString::serialize_from_str(contents, serializer)
    }
}
impl<D: rkyv::rancor::Fallible + ?Sized> rkyv::Deserialize<SmallString, D>
    for rkyv::string::ArchivedString
{
    /// Rebuilds a [`SmallString`] from the archived string; the deserializer is unused
    /// and this implementation never returns an error.
    fn deserialize(&self, _deserializer: &mut D) -> Result<SmallString, D::Error> {
        let restored: SmallString = self.as_str().into();
        Ok(restored)
    }
}
impl PartialEq<SmallString> for rkyv::string::ArchivedString {
    /// Compares the archived string with a [`SmallString`] by their string contents.
    fn eq(&self, other: &SmallString) -> bool {
        let rhs: &str = other;
        let lhs: &str = self;
        rhs == lhs
    }
}
impl PartialOrd<SmallString> for rkyv::string::ArchivedString {
    /// Orders the archived string against a [`SmallString`] by their string contents;
    /// the result is always `Some`.
    fn partial_cmp(&self, other: &SmallString) -> Option<::core::cmp::Ordering> {
        let rhs: &str = other;
        let ordering = Ord::cmp(self.as_str(), rhs);
        Some(ordering)
    }
}
/// A [`schemars::JsonSchema`] implementation for [`SmallString`].
///
/// Every method delegates to the `String` implementation, so a [`SmallString`] is
/// described by the same schema as a `String`.
#[cfg(feature = "schemars")]
impl schemars::JsonSchema for SmallString {
    /// Mirrors whether the `String` schema is referenceable.
    fn is_referenceable() -> bool {
        <String as schemars::JsonSchema>::is_referenceable()
    }

    /// Reuses the schema name of `String`.
    fn schema_name() -> String {
        <String as schemars::JsonSchema>::schema_name()
    }

    /// Produces the `String` schema using the supplied generator.
    // The parameter is used, so it must not carry an underscore prefix; `gen` itself is
    // avoided because it is a reserved keyword in newer editions.
    fn json_schema(generator: &mut schemars::gen::SchemaGenerator) -> schemars::schema::Schema {
        <String as schemars::JsonSchema>::json_schema(generator)
    }
}