Use VerbatimParsedUrl in pep508_rs (#3758)

When parsing requirements from any source, directly parse the url parts
(and reject unsupported urls) instead of parsing url parts at a later
stage. This removes a bunch of error branches and concludes the work of
parsing url parts once and passing them around everywhere.

Many usages of the assembled `VerbatimUrl` remain, but these can be
removed incrementally.

Please review commit-by-commit.
This commit is contained in:
konsti 2024-05-23 21:52:47 +02:00 committed by GitHub
parent 0d2f3fc4e4
commit 4db468e27f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
56 changed files with 877 additions and 656 deletions

View file

@ -16,8 +16,11 @@ workspace = true
pep440_rs = { workspace = true }
pep508_rs = { workspace = true }
uv-normalize = { workspace = true }
uv-git = { workspace = true }
anyhow = { workspace = true }
chrono = { workspace = true, features = ["serde"] }
git2 = { workspace = true }
indexmap = { workspace = true, features = ["serde"] }
mailparse = { workspace = true }
once_cell = { workspace = true }

View file

@ -7,7 +7,9 @@ use serde::{de, Deserialize, Deserializer, Serialize};
use tracing::warn;
use pep440_rs::{VersionSpecifiers, VersionSpecifiersParseError};
use pep508_rs::{Pep508Error, Pep508Url, Requirement, VerbatimUrl};
use pep508_rs::{Pep508Error, Pep508Url, Requirement};
use crate::VerbatimParsedUrl;
/// Ex) `>=7.2.0<8.0.0`
static MISSING_COMMA: Lazy<Regex> = Lazy::new(|| Regex::new(r"(\d)([<>=~^!])").unwrap());
@ -114,7 +116,7 @@ fn parse_with_fixups<Err, T: FromStr<Err = Err>>(input: &str, type_name: &str) -
/// Like [`Requirement`], but attempts to correct some common errors in user-provided requirements.
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub struct LenientRequirement<T: Pep508Url = VerbatimUrl>(Requirement<T>);
pub struct LenientRequirement<T: Pep508Url = VerbatimParsedUrl>(Requirement<T>);
impl<T: Pep508Url> FromStr for LenientRequirement<T> {
type Err = Pep508Error<T>;

View file

@ -2,6 +2,7 @@ pub use base_url::*;
pub use direct_url::*;
pub use lenient_requirement::*;
pub use metadata::*;
pub use parsed_url::*;
pub use scheme::*;
pub use simple_json::*;
@ -9,5 +10,6 @@ mod base_url;
mod direct_url;
mod lenient_requirement;
mod metadata;
mod parsed_url;
mod scheme;
mod simple_json;

View file

@ -9,11 +9,11 @@ use thiserror::Error;
use tracing::warn;
use pep440_rs::{Version, VersionParseError, VersionSpecifiers, VersionSpecifiersParseError};
use pep508_rs::{Pep508Error, Requirement, VerbatimUrl};
use pep508_rs::{Pep508Error, Requirement};
use uv_normalize::{ExtraName, InvalidNameError, PackageName};
use crate::lenient_requirement::LenientRequirement;
use crate::LenientVersionSpecifiers;
use crate::{LenientVersionSpecifiers, VerbatimParsedUrl};
/// Python Package Metadata 2.3 as specified in
/// <https://packaging.python.org/specifications/core-metadata/>.
@ -29,7 +29,7 @@ pub struct Metadata23 {
pub name: PackageName,
pub version: Version,
// Optional fields
pub requires_dist: Vec<Requirement<VerbatimUrl>>,
pub requires_dist: Vec<Requirement<VerbatimParsedUrl>>,
pub requires_python: Option<VersionSpecifiers>,
pub provides_extras: Vec<ExtraName>,
}
@ -50,7 +50,7 @@ pub enum MetadataError {
#[error(transparent)]
Pep440Error(#[from] VersionSpecifiersParseError),
#[error(transparent)]
Pep508Error(#[from] Pep508Error<VerbatimUrl>),
Pep508Error(#[from] Box<Pep508Error<VerbatimParsedUrl>>),
#[error(transparent)]
InvalidName(#[from] InvalidNameError),
#[error("Invalid `Metadata-Version` field: {0}")]
@ -61,6 +61,12 @@ pub enum MetadataError {
DynamicField(&'static str),
}
/// Lets `?` lift an unboxed PEP 508 error into [`MetadataError`], boxing it on the way.
impl From<Pep508Error<VerbatimParsedUrl>> for MetadataError {
    fn from(error: Pep508Error<VerbatimParsedUrl>) -> Self {
        let boxed = Box::new(error);
        Self::Pep508Error(boxed)
    }
}
/// From <https://github.com/PyO3/python-pkginfo-rs/blob/d719988323a0cfea86d4737116d7917f30e819e2/src/metadata.rs#LL78C2-L91C26>
impl Metadata23 {
/// Parse the [`Metadata23`] from a `METADATA` file, as included in a built distribution (wheel).

View file

@ -0,0 +1,414 @@
use std::fmt::{Display, Formatter};
use std::path::{Path, PathBuf};
use thiserror::Error;
use url::{ParseError, Url};
use pep508_rs::{Pep508Url, UnnamedRequirementUrl, VerbatimUrl, VerbatimUrlError};
use uv_git::{GitSha, GitUrl};
use crate::{ArchiveInfo, DirInfo, DirectUrl, VcsInfo, VcsKind};
/// Errors raised while turning a URL into a [`ParsedUrl`] or [`VerbatimParsedUrl`].
#[derive(Debug, Error)]
pub enum ParsedUrlError {
/// The URL scheme has a `<prefix>+` part that is not `git+`.
#[error("Unsupported URL prefix `{prefix}` in URL: `{url}` ({message})")]
UnsupportedUrlPrefix {
/// The part of the scheme before the first `+`.
prefix: String,
/// The URL that was rejected.
url: Url,
/// Short explanation of why the prefix was rejected.
message: &'static str,
},
/// A `file` URL could not be converted into a file system path.
#[error("Invalid path in file URL: `{0}`")]
InvalidFileUrl(Url),
/// Converting the URL into a `GitUrl` failed; carries the original URL and the `git2` error.
#[error("Failed to parse Git reference from URL: `{0}`")]
GitShaParse(Url, #[source] git2::Error),
/// The given string could not be parsed as a URL; carries the offending string.
#[error("Not a valid URL: `{0}`")]
UrlParse(String, #[source] ParseError),
/// An error forwarded unchanged from [`VerbatimUrl`] parsing.
#[error(transparent)]
VerbatimUrl(#[from] VerbatimUrlError),
}
/// A requirement URL kept in two forms: the typed [`ParsedUrl`] and the [`VerbatimUrl`] it was
/// derived from.
///
/// `Display` and serialization use only `verbatim`; deserialization re-derives `parsed_url`.
#[derive(Debug, Clone, Hash, PartialEq, PartialOrd, Eq, Ord)]
pub struct VerbatimParsedUrl {
/// The URL classified as a path, Git, or archive URL.
pub parsed_url: ParsedUrl,
/// The URL as produced by [`VerbatimUrl`] parsing.
pub verbatim: VerbatimUrl,
}
impl Pep508Url for VerbatimParsedUrl {
    type Err = ParsedUrlError;

    /// Parse `url` as a [`VerbatimUrl`] first, then classify the resulting URL as a
    /// [`ParsedUrl`].
    ///
    /// Fails if either step fails, so unsupported URLs are rejected at parse time.
    fn parse_url(url: &str, working_dir: Option<&Path>) -> Result<Self, Self::Err> {
        let verbatim = <VerbatimUrl as Pep508Url>::parse_url(url, working_dir)?;
        let parsed_url = ParsedUrl::try_from(verbatim.to_url())?;
        Ok(Self {
            parsed_url,
            verbatim,
        })
    }
}
impl UnnamedRequirementUrl for VerbatimParsedUrl {
    /// Build a non-editable path URL from `path`, resolved against `working_dir`.
    ///
    /// The stored path is `working_dir` joined with `path`.
    fn parse_path(
        path: impl AsRef<Path>,
        working_dir: impl AsRef<Path>,
    ) -> Result<Self, Self::Err> {
        let verbatim = VerbatimUrl::parse_path(&path, &working_dir)?;
        let url = verbatim.to_url();
        let path = working_dir.as_ref().join(path);
        Ok(Self {
            parsed_url: ParsedUrl::Path(ParsedPathUrl {
                url,
                path,
                editable: false,
            }),
            verbatim,
        })
    }

    /// Build a non-editable path URL from `path`, which is stored as given.
    fn parse_absolute_path(path: impl AsRef<Path>) -> Result<Self, Self::Err> {
        let verbatim = VerbatimUrl::parse_absolute_path(&path)?;
        let url = verbatim.to_url();
        let path = path.as_ref().to_path_buf();
        Ok(Self {
            parsed_url: ParsedUrl::Path(ParsedPathUrl {
                url,
                path,
                editable: false,
            }),
            verbatim,
        })
    }

    /// Parse `url` as a [`VerbatimUrl`], then classify it as a [`ParsedUrl`].
    fn parse_unnamed_url(url: impl AsRef<str>) -> Result<Self, Self::Err> {
        let verbatim = <VerbatimUrl as UnnamedRequirementUrl>::parse_unnamed_url(&url)?;
        let parsed_url = ParsedUrl::try_from(verbatim.to_url())?;
        Ok(Self {
            parsed_url,
            verbatim,
        })
    }

    /// Attach the given string to the verbatim URL; the parsed URL is left as-is.
    fn with_given(self, given: impl Into<String>) -> Self {
        let Self {
            parsed_url,
            verbatim,
        } = self;
        Self {
            parsed_url,
            verbatim: verbatim.with_given(given),
        }
    }

    /// Return the given string stored on the verbatim URL, if any.
    fn given(&self) -> Option<&str> {
        self.verbatim.given()
    }
}
impl Display for VerbatimParsedUrl {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
Display::fmt(&self.verbatim, f)
}
}
impl TryFrom<VerbatimUrl> for VerbatimParsedUrl {
    type Error = ParsedUrlError;

    /// Classify the URL behind `verbatim_url` and keep the verbatim form alongside it.
    fn try_from(verbatim_url: VerbatimUrl) -> Result<Self, Self::Error> {
        ParsedUrl::try_from(verbatim_url.to_url()).map(|parsed_url| Self {
            parsed_url,
            verbatim: verbatim_url,
        })
    }
}
/// Serializes only the verbatim URL; the parsed form is not written out.
impl serde::ser::Serialize for VerbatimParsedUrl {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::ser::Serializer,
    {
        serde::ser::Serialize::serialize(&self.verbatim, serializer)
    }
}
impl<'de> serde::de::Deserialize<'de> for VerbatimParsedUrl {
fn deserialize<D>(deserializer: D) -> Result<VerbatimParsedUrl, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let verbatim_url = VerbatimUrl::deserialize(deserializer)?;
Self::try_from(verbatim_url).map_err(serde::de::Error::custom)
}
}
/// We support three types of URLs for distributions:
/// * The path to a file or directory (`file://`)
/// * A Git repository (`git+https://` or `git+ssh://`), optionally with a subdirectory and/or
/// a reference to check out.
/// * A remote archive (`https://`), optionally with a subdirectory (source dist only).
///
/// A URL in a requirement `foo @ <url>` must be one of the above.
#[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Hash, Ord)]
pub enum ParsedUrl {
/// The direct URL is a path to a local directory or file.
Path(ParsedPathUrl),
/// The direct URL is a path to a Git repository.
Git(ParsedGitUrl),
/// The direct URL is a URL to a source archive (e.g., a `.tar.gz` file) or built archive
/// (i.e., a `.whl` file).
Archive(ParsedArchiveUrl),
}
/// A local path URL.
///
/// Examples:
/// * `file:///home/ferris/my_project`
#[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Hash, Ord)]
pub struct ParsedPathUrl {
/// The URL form of the path.
pub url: Url,
/// The file system path the URL refers to.
pub path: PathBuf,
/// Whether the path is an editable install; every constructor in this file sets it to `false`.
pub editable: bool,
}
/// A Git repository URL.
///
/// Examples:
/// * `git+https://git.example.com/MyProject.git`
/// * `git+https://git.example.com/MyProject.git@v1.0#egg=pkg&subdirectory=pkg_dir`
#[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Hash, Ord)]
pub struct ParsedGitUrl {
/// The Git URL, built from the input with any leading `git+` removed.
pub url: GitUrl,
/// The value of the `subdirectory=` fragment entry, if the URL had one.
pub subdirectory: Option<PathBuf>,
}
impl TryFrom<Url> for ParsedGitUrl {
    type Error = ParsedUrlError;

    /// Supports URLs with and without the `git+` prefix.
    ///
    /// When the URL includes a prefix, it's presumed to come from a PEP 508 requirement; when it's
    /// excluded, it's presumed to come from `tool.uv.sources`.
    fn try_from(url_in: Url) -> Result<Self, Self::Error> {
        // Read the subdirectory off the original URL before anything is stripped.
        let subdirectory = get_subdirectory(&url_in);

        // Drop a leading `git+`, if present, and re-parse what is left.
        let full = url_in.as_str();
        let stripped = full.strip_prefix("git+").unwrap_or(full);
        let bare = match Url::parse(stripped) {
            Ok(bare) => bare,
            Err(err) => return Err(ParsedUrlError::UrlParse(stripped.to_string(), err)),
        };

        match GitUrl::try_from(bare) {
            Ok(url) => Ok(Self { url, subdirectory }),
            // Report the URL as the caller passed it, prefix included.
            Err(err) => Err(ParsedUrlError::GitShaParse(url_in, err)),
        }
    }
}
/// A URL to a source or built archive.
///
/// Examples:
/// * A built distribution: `https://files.pythonhosted.org/packages/62/06/d5604a70d160f6a6ca5fd2ba25597c24abd5c5ca5f437263d177ac242308/tqdm-4.66.1-py2.py3-none-any.whl`
/// * A source distribution with a valid name: `https://files.pythonhosted.org/packages/62/06/d5604a70d160f6a6ca5fd2ba25597c24abd5c5ca5f437263d177ac242308/tqdm-4.66.1.tar.gz`
/// * A source dist with a recognizable extension but invalid name: `https://github.com/foo-labs/foo/archive/master.zip#egg=pkg&subdirectory=packages/bar`
#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Ord)]
pub struct ParsedArchiveUrl {
/// The archive URL, stored as given (fragment included).
pub url: Url,
/// The value of the `subdirectory=` fragment entry, if the URL had one.
pub subdirectory: Option<PathBuf>,
}
impl From<Url> for ParsedArchiveUrl {
    /// Wrap `url`, extracting the `subdirectory=` fragment entry if there is one.
    fn from(url: Url) -> Self {
        Self {
            // Evaluated before `url` is moved into the struct.
            subdirectory: get_subdirectory(&url),
            url,
        }
    }
}
/// Extract the subdirectory a URL points to, if any.
///
/// The subdirectory is the first `&`-separated fragment entry starting with `subdirectory=`,
/// as in (git):
///   `git+https://git.example.com/MyProject.git@v1.0#subdirectory=pkg_dir`
///   `git+https://git.example.com/MyProject.git@v1.0#egg=pkg&subdirectory=pkg_dir`
/// or (direct archive url):
///   `https://github.com/foo-labs/foo/archive/master.zip#subdirectory=packages/bar`
///   `https://github.com/foo-labs/foo/archive/master.zip#egg=pkg&subdirectory=packages/bar`
///
/// Returns `None` when the URL has no fragment or no such entry.
fn get_subdirectory(url: &Url) -> Option<PathBuf> {
    for entry in url.fragment()?.split('&') {
        if let Some(dir) = entry.strip_prefix("subdirectory=") {
            return Some(PathBuf::from(dir));
        }
    }
    None
}
/// Return the precise Git reference of the given URL, if it has one.
///
/// Fails with a boxed [`ParsedUrlError`] when the URL cannot be parsed as a Git URL.
pub fn git_reference(url: Url) -> Result<Option<GitSha>, Box<ParsedUrlError>> {
    match ParsedGitUrl::try_from(url) {
        Ok(parsed) => Ok(parsed.url.precise()),
        Err(err) => Err(Box::new(err)),
    }
}
impl TryFrom<Url> for ParsedUrl {
    type Error = ParsedUrlError;

    /// Classify `url` as a Git, path, or archive URL.
    ///
    /// * A scheme of the form `<prefix>+...` is accepted only for `git`; any other prefix is
    ///   rejected with [`ParsedUrlError::UnsupportedUrlPrefix`].
    /// * A `file` scheme becomes a non-editable [`ParsedUrl::Path`], or
    ///   [`ParsedUrlError::InvalidFileUrl`] if it cannot be turned into a file system path.
    /// * Everything else is treated as an archive URL.
    fn try_from(url: Url) -> Result<Self, Self::Error> {
        if let Some((prefix, ..)) = url.scheme().split_once('+') {
            // Only the message differs between the rejected prefixes, so pick it here and
            // build the error once below.
            let message = match prefix {
                "git" => return Ok(Self::Git(ParsedGitUrl::try_from(url)?)),
                "bzr" => "Bazaar is not supported",
                "hg" => "Mercurial is not supported",
                "svn" => "Subversion is not supported",
                _ => "Unknown scheme",
            };
            // `prefix` is copied out first, so the owned `url` can be moved without a clone.
            Err(ParsedUrlError::UnsupportedUrlPrefix {
                prefix: prefix.to_string(),
                url,
                message,
            })
        } else if url.scheme().eq_ignore_ascii_case("file") {
            // Either outcome consumes `url`, so no clone is needed.
            match url.to_file_path() {
                Ok(path) => Ok(Self::Path(ParsedPathUrl {
                    url,
                    path,
                    editable: false,
                })),
                Err(()) => Err(ParsedUrlError::InvalidFileUrl(url)),
            }
        } else {
            Ok(Self::Archive(ParsedArchiveUrl::from(url)))
        }
    }
}
impl TryFrom<&ParsedUrl> for DirectUrl {
    type Error = ParsedUrlError;

    /// Dispatch to the conversion for whichever kind of URL `value` holds.
    fn try_from(value: &ParsedUrl) -> Result<Self, Self::Error> {
        match value {
            ParsedUrl::Path(path_url) => Self::try_from(path_url),
            ParsedUrl::Git(git_url) => Self::try_from(git_url),
            ParsedUrl::Archive(archive_url) => Self::try_from(archive_url),
        }
    }
}
impl TryFrom<&ParsedPathUrl> for DirectUrl {
    type Error = ParsedUrlError;

    /// Convert a path URL into a `LocalDirectory` direct URL.
    ///
    /// `editable` is recorded as `Some(true)` when set and omitted otherwise.
    fn try_from(value: &ParsedPathUrl) -> Result<Self, Self::Error> {
        let url = value.url.to_string();
        let dir_info = DirInfo {
            editable: value.editable.then_some(true),
        };
        Ok(Self::LocalDirectory { url, dir_info })
    }
}
impl TryFrom<&ParsedArchiveUrl> for DirectUrl {
    type Error = ParsedUrlError;

    /// Convert an archive URL into an `ArchiveUrl` direct URL with no hash information.
    fn try_from(value: &ParsedArchiveUrl) -> Result<Self, Self::Error> {
        let url = value.url.to_string();
        let archive_info = ArchiveInfo {
            hash: None,
            hashes: None,
        };
        let subdirectory = value.subdirectory.clone();
        Ok(Self::ArchiveUrl {
            url,
            archive_info,
            subdirectory,
        })
    }
}
impl TryFrom<&ParsedGitUrl> for DirectUrl {
    type Error = ParsedUrlError;

    /// Convert a Git URL into a `VcsUrl` direct URL, carrying over the repository, the precise
    /// commit (if any), the requested reference (if any), and the subdirectory.
    fn try_from(value: &ParsedGitUrl) -> Result<Self, Self::Error> {
        let git = &value.url;
        let url = git.repository().to_string();
        let vcs_info = VcsInfo {
            vcs: VcsKind::Git,
            commit_id: git.precise().as_ref().map(ToString::to_string),
            requested_revision: git.reference().as_str().map(ToString::to_string),
        };
        Ok(Self::VcsUrl {
            url,
            vcs_info,
            subdirectory: value.subdirectory.clone(),
        })
    }
}
impl From<ParsedUrl> for Url {
    /// Reassemble a plain [`Url`] from whichever kind of parsed URL `value` holds.
    fn from(value: ParsedUrl) -> Self {
        match value {
            ParsedUrl::Path(path_url) => Self::from(path_url),
            ParsedUrl::Git(git_url) => Self::from(git_url),
            ParsedUrl::Archive(archive_url) => Self::from(archive_url),
        }
    }
}
impl From<ParsedPathUrl> for Url {
fn from(value: ParsedPathUrl) -> Self {
value.url
}
}
impl From<ParsedArchiveUrl> for Url {
fn from(value: ParsedArchiveUrl) -> Self {
let mut url = value.url;
if let Some(subdirectory) = value.subdirectory {
url.set_fragment(Some(&format!("subdirectory={}", subdirectory.display())));
}
url
}
}
impl From<ParsedGitUrl> for Url {
fn from(value: ParsedGitUrl) -> Self {
let mut url = Self::parse(&format!("{}{}", "git+", Self::from(value.url).as_str()))
.expect("Git URL is invalid");
if let Some(subdirectory) = value.subdirectory {
url.set_fragment(Some(&format!("subdirectory={}", subdirectory.display())));
}
url
}
}
#[cfg(test)]
mod tests {
    use anyhow::Result;
    use url::Url;

    use crate::parsed_url::ParsedUrl;

    /// Parse `input` as a URL and send it through `ParsedUrl` and back.
    ///
    /// Returns `(expected, actual)`: the URL as parsed, and the URL after the round trip.
    fn round_trip(input: &str) -> Result<(Url, Url)> {
        let expected = Url::parse(input)?;
        let actual = Url::from(ParsedUrl::try_from(expected.clone())?);
        Ok((expected, actual))
    }

    #[test]
    fn direct_url_from_url() -> Result<()> {
        // URLs that must survive the round trip unchanged.
        let lossless = [
            "git+https://github.com/pallets/flask.git",
            "git+https://github.com/pallets/flask.git#subdirectory=pkg_dir",
            "git+https://github.com/pallets/flask.git@2.0.0",
            "git+https://github.com/pallets/flask.git@2.0.0#subdirectory=pkg_dir",
        ];
        for input in lossless {
            let (expected, actual) = round_trip(input)?;
            assert_eq!(expected, actual);
        }

        // TODO(charlie): Preserve other fragments.
        let (expected, actual) =
            round_trip("git+https://github.com/pallets/flask.git#egg=flask&subdirectory=pkg_dir")?;
        assert_ne!(expected, actual);

        Ok(())
    }

    #[test]
    #[cfg(unix)]
    fn direct_url_from_url_absolute() -> Result<()> {
        let (expected, actual) = round_trip("file:///path/to/directory")?;
        assert_eq!(expected, actual);
        Ok(())
    }
}