mirror of
https://github.com/astral-sh/uv.git
synced 2025-08-03 10:33:49 +00:00
Add support for HTML indexes (#719)
## Summary This PR adds support for HTML index responses (as with `--index-url=https://download.pytorch.org/whl`). Closes https://github.com/astral-sh/puffin/issues/412.
This commit is contained in:
parent
9e6cb706a0
commit
5bce699ee1
11 changed files with 465 additions and 19 deletions
8
Cargo.lock
generated
8
Cargo.lock
generated
|
@ -2373,6 +2373,7 @@ dependencies = [
|
||||||
"futures",
|
"futures",
|
||||||
"http",
|
"http",
|
||||||
"http-cache-semantics",
|
"http-cache-semantics",
|
||||||
|
"insta",
|
||||||
"install-wheel-rs",
|
"install-wheel-rs",
|
||||||
"pep440_rs 0.3.12",
|
"pep440_rs 0.3.12",
|
||||||
"pep508_rs",
|
"pep508_rs",
|
||||||
|
@ -2389,6 +2390,7 @@ dependencies = [
|
||||||
"sha2",
|
"sha2",
|
||||||
"tempfile",
|
"tempfile",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
|
"tl",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-util",
|
"tokio-util",
|
||||||
"tracing",
|
"tracing",
|
||||||
|
@ -3701,6 +3703,12 @@ version = "0.1.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tl"
|
||||||
|
version = "0.7.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d5e993a1c7c32fdf90a308cec4d457f507b2573acc909bd6e7a092321664fdb3"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tokio"
|
name = "tokio"
|
||||||
version = "1.33.0"
|
version = "1.33.0"
|
||||||
|
|
|
@ -75,6 +75,7 @@ target-lexicon = { version = "0.12.12" }
|
||||||
tempfile = { version = "3.8.1" }
|
tempfile = { version = "3.8.1" }
|
||||||
textwrap = { version = "0.15.2" }
|
textwrap = { version = "0.15.2" }
|
||||||
thiserror = { version = "1.0.50" }
|
thiserror = { version = "1.0.50" }
|
||||||
|
tl = { version = "0.7.7" }
|
||||||
tokio = { version = "1.33.0", features = ["rt-multi-thread"] }
|
tokio = { version = "1.33.0", features = ["rt-multi-thread"] }
|
||||||
tokio-util = { version = "0.7.10", features = ["compat"] }
|
tokio-util = { version = "0.7.10", features = ["compat"] }
|
||||||
toml = { version = "0.8.6" }
|
toml = { version = "0.8.6" }
|
||||||
|
|
|
@ -8,7 +8,8 @@ use thiserror::Error;
|
||||||
use pep440_rs::Version;
|
use pep440_rs::Version;
|
||||||
use puffin_normalize::{InvalidNameError, PackageName};
|
use puffin_normalize::{InvalidNameError, PackageName};
|
||||||
|
|
||||||
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
|
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||||
|
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
||||||
pub enum SourceDistExtension {
|
pub enum SourceDistExtension {
|
||||||
Zip,
|
Zip,
|
||||||
TarGz,
|
TarGz,
|
||||||
|
|
|
@ -2701,3 +2701,42 @@ fn cache_errors_are_non_fatal() -> Result<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Resolve a distribution from an HTML-only registry.
|
||||||
|
#[test]
|
||||||
|
fn compile_html() -> Result<()> {
|
||||||
|
let temp_dir = TempDir::new()?;
|
||||||
|
let cache_dir = TempDir::new()?;
|
||||||
|
let venv = create_venv_py312(&temp_dir, &cache_dir);
|
||||||
|
|
||||||
|
let requirements_in = temp_dir.child("requirements.in");
|
||||||
|
requirements_in.write_str("jinja2<=3.1.2")?;
|
||||||
|
|
||||||
|
insta::with_settings!({
|
||||||
|
filters => INSTA_FILTERS.to_vec()
|
||||||
|
}, {
|
||||||
|
assert_cmd_snapshot!(Command::new(get_cargo_bin(BIN_NAME))
|
||||||
|
.arg("pip-compile")
|
||||||
|
.arg("requirements.in")
|
||||||
|
.arg("--cache-dir")
|
||||||
|
.arg(cache_dir.path())
|
||||||
|
.arg("--index-url")
|
||||||
|
.arg("https://download.pytorch.org/whl")
|
||||||
|
.env("VIRTUAL_ENV", venv.as_os_str())
|
||||||
|
.current_dir(&temp_dir), @r###"
|
||||||
|
success: true
|
||||||
|
exit_code: 0
|
||||||
|
----- stdout -----
|
||||||
|
# This file was autogenerated by Puffin v0.0.1 via the following command:
|
||||||
|
# puffin pip-compile requirements.in --cache-dir [CACHE_DIR]
|
||||||
|
jinja2==3.1.2
|
||||||
|
markupsafe==2.1.3
|
||||||
|
# via jinja2
|
||||||
|
|
||||||
|
----- stderr -----
|
||||||
|
Resolved 2 packages in [TIME]
|
||||||
|
"###);
|
||||||
|
});
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
|
@ -4,7 +4,7 @@ version = "0.0.1"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
distribution-filename = { path = "../distribution-filename" }
|
distribution-filename = { path = "../distribution-filename", features = ["serde"] }
|
||||||
distribution-types = { path = "../distribution-types" }
|
distribution-types = { path = "../distribution-types" }
|
||||||
install-wheel-rs = { path = "../install-wheel-rs" }
|
install-wheel-rs = { path = "../install-wheel-rs" }
|
||||||
pep440_rs = { path = "../pep440-rs" }
|
pep440_rs = { path = "../pep440-rs" }
|
||||||
|
@ -28,6 +28,7 @@ serde_json = { workspace = true }
|
||||||
sha2 = { workspace = true }
|
sha2 = { workspace = true }
|
||||||
tempfile = { workspace = true }
|
tempfile = { workspace = true }
|
||||||
thiserror = { workspace = true }
|
thiserror = { workspace = true }
|
||||||
|
tl = { workspace = true }
|
||||||
tokio = { workspace = true, features = ["fs"] }
|
tokio = { workspace = true, features = ["fs"] }
|
||||||
tokio-util = { workspace = true }
|
tokio-util = { workspace = true }
|
||||||
tracing = { workspace = true }
|
tracing = { workspace = true }
|
||||||
|
@ -37,4 +38,5 @@ url = { workspace = true }
|
||||||
pep508_rs = { path = "../pep508-rs" }
|
pep508_rs = { path = "../pep508-rs" }
|
||||||
|
|
||||||
anyhow = { workspace = true }
|
anyhow = { workspace = true }
|
||||||
|
insta = { version = "1.34.0" }
|
||||||
tokio = { workspace = true, features = ["fs", "macros"] }
|
tokio = { workspace = true, features = ["fs", "macros"] }
|
||||||
|
|
|
@ -5,6 +5,7 @@ use async_zip::error::ZipError;
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
|
use crate::html;
|
||||||
use distribution_filename::{WheelFilename, WheelFilenameError};
|
use distribution_filename::{WheelFilename, WheelFilenameError};
|
||||||
use puffin_normalize::PackageName;
|
use puffin_normalize::PackageName;
|
||||||
|
|
||||||
|
@ -49,6 +50,9 @@ pub enum Error {
|
||||||
#[error("Received some unexpected JSON from {url}")]
|
#[error("Received some unexpected JSON from {url}")]
|
||||||
BadJson { source: serde_json::Error, url: Url },
|
BadJson { source: serde_json::Error, url: Url },
|
||||||
|
|
||||||
|
#[error("Received some unexpected HTML from {url}")]
|
||||||
|
BadHtml { source: html::Error, url: Url },
|
||||||
|
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
AsyncHttpRangeReader(#[from] AsyncHttpRangeReaderError),
|
AsyncHttpRangeReader(#[from] AsyncHttpRangeReaderError),
|
||||||
|
|
||||||
|
@ -82,10 +86,23 @@ pub enum Error {
|
||||||
/// An [`io::Error`] with a filename attached
|
/// An [`io::Error`] with a filename attached
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
Persist(#[from] tempfile::PersistError),
|
Persist(#[from] tempfile::PersistError),
|
||||||
|
|
||||||
|
#[error("Missing `Content-Type` header for {0}")]
|
||||||
|
MissingContentType(Url),
|
||||||
|
|
||||||
|
#[error("Invalid `Content-Type` header for {0}")]
|
||||||
|
InvalidContentTypeHeader(Url, #[source] http::header::ToStrError),
|
||||||
|
|
||||||
|
#[error("Unsupported `Content-Type` \"{1}\" for {0}")]
|
||||||
|
UnsupportedMediaType(Url, String),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Error {
|
impl Error {
|
||||||
pub fn from_json_err(err: serde_json::Error, url: Url) -> Self {
|
pub(crate) fn from_json_err(err: serde_json::Error, url: Url) -> Self {
|
||||||
Self::BadJson { source: err, url }
|
Self::BadJson { source: err, url }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn from_html_err(err: html::Error, url: Url) -> Self {
|
||||||
|
Self::BadHtml { source: err, url }
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
332
crates/puffin-client/src/html.rs
Normal file
332
crates/puffin-client/src/html.rs
Normal file
|
@ -0,0 +1,332 @@
|
||||||
|
use std::str::FromStr;
|
||||||
|
|
||||||
|
use tl::HTMLTag;
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
|
use pep440_rs::VersionSpecifiers;
|
||||||
|
use pypi_types::{DistInfoMetadata, File, Hashes, Yanked};
|
||||||
|
|
||||||
|
#[derive(Debug, thiserror::Error)]
|
||||||
|
pub enum Error {
|
||||||
|
#[error(transparent)]
|
||||||
|
Utf8(#[from] std::str::Utf8Error),
|
||||||
|
|
||||||
|
#[error(transparent)]
|
||||||
|
UrlParse(#[from] url::ParseError),
|
||||||
|
|
||||||
|
#[error("Missing href attribute on URL: {0}")]
|
||||||
|
MissingHref(Url),
|
||||||
|
|
||||||
|
#[error("Expected distribution filename as last path component of URL: {0}")]
|
||||||
|
MissingFilename(Url),
|
||||||
|
|
||||||
|
#[error("Missing hash attribute on URL: {0}")]
|
||||||
|
MissingHash(Url),
|
||||||
|
|
||||||
|
#[error("Unexpected fragment (expected `#sha256=...`) on URL: {0}")]
|
||||||
|
FragmentParse(Url),
|
||||||
|
|
||||||
|
#[error("Unsupported hash algorithm (expected `sha256`) on: {0}")]
|
||||||
|
UnsupportedHashAlgorithm(Url),
|
||||||
|
|
||||||
|
#[error("Invalid `requires-python` specifier: {0}")]
|
||||||
|
Pep440(#[source] pep440_rs::Pep440Error),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse the list of [`File`]s from the simple HTML page returned by the given URL.
|
||||||
|
pub(crate) fn parse_simple(text: &str, base: &Url) -> Result<Vec<File>, Error> {
|
||||||
|
let dom = tl::parse(text, tl::ParserOptions::default()).unwrap();
|
||||||
|
|
||||||
|
// Parse the first `<base>` tag, if any, to determine the base URL to which all
|
||||||
|
// relative URLs should be resolved. The HTML spec requires that the `<base>` tag
|
||||||
|
// appear before other tags with attribute values of URLs.
|
||||||
|
let base = dom
|
||||||
|
.nodes()
|
||||||
|
.iter()
|
||||||
|
.filter_map(|node| node.as_tag())
|
||||||
|
.take_while(|tag| !matches!(tag.name().as_bytes(), b"a" | b"link"))
|
||||||
|
.find(|tag| tag.name().as_bytes() == b"base")
|
||||||
|
.map(|base| parse_base(base))
|
||||||
|
.transpose()?
|
||||||
|
.flatten()
|
||||||
|
.unwrap_or_else(|| base.clone());
|
||||||
|
|
||||||
|
// Parse each `<a>` tag, to extract the filename, hash, and URL.
|
||||||
|
let files: Vec<File> = dom
|
||||||
|
.nodes()
|
||||||
|
.iter()
|
||||||
|
.filter_map(|node| node.as_tag())
|
||||||
|
.filter(|link| link.name().as_bytes() == b"a")
|
||||||
|
.map(|link| parse_anchor(link, &base))
|
||||||
|
.collect::<Result<Vec<_>, _>>()?;
|
||||||
|
|
||||||
|
Ok(files)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse the `href` from a `<base>` tag.
|
||||||
|
fn parse_base(base: &HTMLTag) -> Result<Option<Url>, Error> {
|
||||||
|
let Some(Some(href)) = base.attributes().get("href") else {
|
||||||
|
return Ok(None);
|
||||||
|
};
|
||||||
|
let href = std::str::from_utf8(href.as_bytes())?;
|
||||||
|
let url = Url::parse(href)?;
|
||||||
|
Ok(Some(url))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse the hash from a fragment, as in: `sha256=6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61`
|
||||||
|
fn parse_hash(fragment: &str, url: &Url) -> Result<Hashes, Error> {
|
||||||
|
let mut parts = fragment.split('=');
|
||||||
|
|
||||||
|
// Extract the key and value.
|
||||||
|
let name = parts
|
||||||
|
.next()
|
||||||
|
.ok_or_else(|| Error::FragmentParse(url.clone()))?;
|
||||||
|
let value = parts
|
||||||
|
.next()
|
||||||
|
.ok_or_else(|| Error::FragmentParse(url.clone()))?;
|
||||||
|
|
||||||
|
// Ensure there are no more parts.
|
||||||
|
if parts.next().is_some() {
|
||||||
|
return Err(Error::FragmentParse(url.clone()));
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO(charlie): Support all hash algorithms.
|
||||||
|
if name != "sha256" {
|
||||||
|
return Err(Error::UnsupportedHashAlgorithm(url.clone()));
|
||||||
|
}
|
||||||
|
|
||||||
|
let sha256 = std::str::from_utf8(value.as_bytes())?;
|
||||||
|
let sha256 = sha256.to_string();
|
||||||
|
Ok(Hashes { sha256 })
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse a [`File`] from an `<a>` tag.
|
||||||
|
fn parse_anchor(link: &HTMLTag, base: &Url) -> Result<File, Error> {
|
||||||
|
// Extract the href.
|
||||||
|
let href = link
|
||||||
|
.attributes()
|
||||||
|
.get("href")
|
||||||
|
.flatten()
|
||||||
|
.ok_or_else(|| Error::MissingHref(base.clone()))?;
|
||||||
|
let href = std::str::from_utf8(href.as_bytes())?;
|
||||||
|
let url = base.join(href)?;
|
||||||
|
|
||||||
|
// Extract the filename from the body text, which MUST match that of
|
||||||
|
// the final path component of the URL.
|
||||||
|
let filename = url
|
||||||
|
.path_segments()
|
||||||
|
.and_then(|segments| segments.last())
|
||||||
|
.ok_or_else(|| Error::MissingFilename(url.clone()))?;
|
||||||
|
|
||||||
|
// Extract the hash, which should be in the fragment.
|
||||||
|
let hashes = url
|
||||||
|
.fragment()
|
||||||
|
.map(|fragment| parse_hash(fragment, &url))
|
||||||
|
.transpose()?
|
||||||
|
.ok_or_else(|| Error::MissingHash(url.clone()))?;
|
||||||
|
|
||||||
|
// Extract the `requires-python` field, which should be set on the
|
||||||
|
// `data-requires-python` attribute.
|
||||||
|
let requires_python =
|
||||||
|
if let Some(requires_python) = link.attributes().get("data-requires-python").flatten() {
|
||||||
|
let requires_python = std::str::from_utf8(requires_python.as_bytes())?;
|
||||||
|
let requires_python =
|
||||||
|
VersionSpecifiers::from_str(requires_python).map_err(Error::Pep440)?;
|
||||||
|
Some(requires_python)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
// Extract the `data-dist-info-metadata` field, which should be set on
|
||||||
|
// the `data-dist-info-metadata` attribute.
|
||||||
|
let dist_info_metadata = if let Some(dist_info_metadata) =
|
||||||
|
link.attributes().get("data-dist-info-metadata").flatten()
|
||||||
|
{
|
||||||
|
let dist_info_metadata = std::str::from_utf8(dist_info_metadata.as_bytes())?;
|
||||||
|
match dist_info_metadata {
|
||||||
|
"true" => Some(DistInfoMetadata::Bool(true)),
|
||||||
|
"false" => Some(DistInfoMetadata::Bool(false)),
|
||||||
|
fragment => Some(DistInfoMetadata::Hashes(parse_hash(fragment, &url)?)),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
// Extract the `yanked` field, which should be set on the `data-yanked`
|
||||||
|
// attribute.
|
||||||
|
let yanked = if let Some(yanked) = link.attributes().get("data-yanked").flatten() {
|
||||||
|
let yanked = std::str::from_utf8(yanked.as_bytes())?;
|
||||||
|
Some(Yanked::Reason(yanked.to_string()))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(File {
|
||||||
|
dist_info_metadata,
|
||||||
|
yanked,
|
||||||
|
requires_python,
|
||||||
|
hashes,
|
||||||
|
filename: filename.to_string(),
|
||||||
|
// TODO(charlie): Store serialized URLs.
|
||||||
|
url: url.to_string(),
|
||||||
|
size: None,
|
||||||
|
upload_time: None,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn parse_file() {
|
||||||
|
let text = r#"
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<h1>Links for jinja2</h1>
|
||||||
|
<a href="/whl/Jinja2-3.1.2-py3-none-any.whl#sha256=6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61">Jinja2-3.1.2-py3-none-any.whl</a><br/>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
<!--TIMESTAMP 1703347410-->
|
||||||
|
"#;
|
||||||
|
let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap();
|
||||||
|
let result = parse_simple(text, &base).unwrap();
|
||||||
|
insta::assert_debug_snapshot!(result, @r###"
|
||||||
|
[
|
||||||
|
File {
|
||||||
|
dist_info_metadata: None,
|
||||||
|
filename: "Jinja2-3.1.2-py3-none-any.whl",
|
||||||
|
hashes: Hashes {
|
||||||
|
sha256: "6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61",
|
||||||
|
},
|
||||||
|
requires_python: None,
|
||||||
|
size: None,
|
||||||
|
upload_time: None,
|
||||||
|
url: "https://download.pytorch.org/whl/Jinja2-3.1.2-py3-none-any.whl#sha256=6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61",
|
||||||
|
yanked: None,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn parse_base() {
|
||||||
|
let text = r#"
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<base href="https://index.python.org/">
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>Links for jinja2</h1>
|
||||||
|
<a href="/whl/Jinja2-3.1.2-py3-none-any.whl#sha256=6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61">Jinja2-3.1.2-py3-none-any.whl</a><br/>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
<!--TIMESTAMP 1703347410-->
|
||||||
|
"#;
|
||||||
|
let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap();
|
||||||
|
let result = parse_simple(text, &base).unwrap();
|
||||||
|
insta::assert_debug_snapshot!(result, @r###"
|
||||||
|
[
|
||||||
|
File {
|
||||||
|
dist_info_metadata: None,
|
||||||
|
filename: "Jinja2-3.1.2-py3-none-any.whl",
|
||||||
|
hashes: Hashes {
|
||||||
|
sha256: "6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61",
|
||||||
|
},
|
||||||
|
requires_python: None,
|
||||||
|
size: None,
|
||||||
|
upload_time: None,
|
||||||
|
url: "https://index.python.org/whl/Jinja2-3.1.2-py3-none-any.whl#sha256=6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61",
|
||||||
|
yanked: None,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn parse_missing_href() {
|
||||||
|
let text = r#"
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<h1>Links for jinja2</h1>
|
||||||
|
<a>Jinja2-3.1.2-py3-none-any.whl</a><br/>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
<!--TIMESTAMP 1703347410-->
|
||||||
|
"#;
|
||||||
|
let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap();
|
||||||
|
let result = parse_simple(text, &base).unwrap_err();
|
||||||
|
insta::assert_display_snapshot!(result, @"Missing href attribute on URL: https://download.pytorch.org/whl/jinja2/");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn parse_empty_href() {
|
||||||
|
let text = r#"
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<h1>Links for jinja2</h1>
|
||||||
|
<a href="">Jinja2-3.1.2-py3-none-any.whl</a><br/>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
<!--TIMESTAMP 1703347410-->
|
||||||
|
"#;
|
||||||
|
let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap();
|
||||||
|
let result = parse_simple(text, &base).unwrap_err();
|
||||||
|
insta::assert_display_snapshot!(result, @"Missing hash attribute on URL: https://download.pytorch.org/whl/jinja2/");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn parse_missing_hash() {
|
||||||
|
let text = r#"
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<h1>Links for jinja2</h1>
|
||||||
|
<a href="/whl/Jinja2-3.1.2-py3-none-any.whl">Jinja2-3.1.2-py3-none-any.whl</a><br/>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
<!--TIMESTAMP 1703347410-->
|
||||||
|
"#;
|
||||||
|
let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap();
|
||||||
|
let result = parse_simple(text, &base).unwrap_err();
|
||||||
|
insta::assert_display_snapshot!(result, @"Missing hash attribute on URL: https://download.pytorch.org/whl/Jinja2-3.1.2-py3-none-any.whl");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn parse_missing_hash_value() {
|
||||||
|
let text = r#"
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<h1>Links for jinja2</h1>
|
||||||
|
<a href="/whl/Jinja2-3.1.2-py3-none-any.whl#sha256">Jinja2-3.1.2-py3-none-any.whl</a><br/>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
<!--TIMESTAMP 1703347410-->
|
||||||
|
"#;
|
||||||
|
let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap();
|
||||||
|
let result = parse_simple(text, &base).unwrap_err();
|
||||||
|
insta::assert_display_snapshot!(result, @"Unexpected fragment (expected `#sha256=...`) on URL: https://download.pytorch.org/whl/Jinja2-3.1.2-py3-none-any.whl#sha256");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn parse_unknown_hash() {
|
||||||
|
let text = r#"
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<h1>Links for jinja2</h1>
|
||||||
|
<a href="/whl/Jinja2-3.1.2-py3-none-any.whl#sha512=6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61">Jinja2-3.1.2-py3-none-any.whl</a><br/>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
<!--TIMESTAMP 1703347410-->
|
||||||
|
"#;
|
||||||
|
let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap();
|
||||||
|
let result = parse_simple(text, &base).unwrap_err();
|
||||||
|
insta::assert_display_snapshot!(result, @"Unsupported hash algorithm (expected `sha256`) on: https://download.pytorch.org/whl/Jinja2-3.1.2-py3-none-any.whl#sha512=6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61");
|
||||||
|
}
|
||||||
|
}
|
|
@ -6,5 +6,6 @@ pub use registry_client::{
|
||||||
|
|
||||||
mod cached_client;
|
mod cached_client;
|
||||||
mod error;
|
mod error;
|
||||||
|
mod html;
|
||||||
mod registry_client;
|
mod registry_client;
|
||||||
mod remote_metadata;
|
mod remote_metadata;
|
||||||
|
|
|
@ -25,7 +25,7 @@ use puffin_normalize::PackageName;
|
||||||
use pypi_types::{File, IndexUrl, IndexUrls, Metadata21, SimpleJson};
|
use pypi_types::{File, IndexUrl, IndexUrls, Metadata21, SimpleJson};
|
||||||
|
|
||||||
use crate::remote_metadata::wheel_metadata_from_remote_zip;
|
use crate::remote_metadata::wheel_metadata_from_remote_zip;
|
||||||
use crate::{CachedClient, CachedClientError, Error};
|
use crate::{html, CachedClient, CachedClientError, Error};
|
||||||
|
|
||||||
/// A builder for an [`RegistryClient`].
|
/// A builder for an [`RegistryClient`].
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
|
@ -135,7 +135,6 @@ impl RegistryClient {
|
||||||
let mut url: Url = index.clone().into();
|
let mut url: Url = index.clone().into();
|
||||||
url.path_segments_mut().unwrap().push(package_name.as_ref());
|
url.path_segments_mut().unwrap().push(package_name.as_ref());
|
||||||
url.path_segments_mut().unwrap().push("");
|
url.path_segments_mut().unwrap().push("");
|
||||||
url.set_query(Some("format=application/vnd.pypi.simple.v1+json"));
|
|
||||||
|
|
||||||
trace!("Fetching metadata for {} from {}", package_name, url);
|
trace!("Fetching metadata for {} from {}", package_name, url);
|
||||||
|
|
||||||
|
@ -153,14 +152,37 @@ impl RegistryClient {
|
||||||
.uncached()
|
.uncached()
|
||||||
.get(url.clone())
|
.get(url.clone())
|
||||||
.header("Accept-Encoding", "gzip")
|
.header("Accept-Encoding", "gzip")
|
||||||
.header("Accept", "application/vnd.pypi.simple.v1+json")
|
.header("Accept", MediaType::accepts())
|
||||||
.build()?;
|
.build()?;
|
||||||
let parse_simple_response = |response: Response| async {
|
let parse_simple_response = |response: Response| async {
|
||||||
let bytes = response.bytes().await?;
|
let content_type = response
|
||||||
let data: SimpleJson = serde_json::from_slice(bytes.as_ref())
|
.headers()
|
||||||
.map_err(|err| Error::from_json_err(err, url))?;
|
.get("content-type")
|
||||||
let metadata = SimpleMetadata::from_package_simple_json(package_name, data);
|
.ok_or_else(|| Error::MissingContentType(url.clone()))?;
|
||||||
Ok(metadata)
|
let content_type = content_type
|
||||||
|
.to_str()
|
||||||
|
.map_err(|err| Error::InvalidContentTypeHeader(url.clone(), err))?;
|
||||||
|
let media_type = content_type.split(';').next().unwrap_or(content_type);
|
||||||
|
let media_type = MediaType::from_str(media_type).ok_or_else(|| {
|
||||||
|
Error::UnsupportedMediaType(url.clone(), media_type.to_string())
|
||||||
|
})?;
|
||||||
|
|
||||||
|
match media_type {
|
||||||
|
MediaType::Json => {
|
||||||
|
let bytes = response.bytes().await?;
|
||||||
|
let data: SimpleJson = serde_json::from_slice(bytes.as_ref())
|
||||||
|
.map_err(|err| Error::from_json_err(err, url.clone()))?;
|
||||||
|
let metadata = SimpleMetadata::from_files(package_name, data.files);
|
||||||
|
Ok(metadata)
|
||||||
|
}
|
||||||
|
MediaType::Html => {
|
||||||
|
let text = response.text().await?;
|
||||||
|
let files = html::parse_simple(&text, &url)
|
||||||
|
.map_err(|err| Error::from_html_err(err, url.clone()))?;
|
||||||
|
let metadata = SimpleMetadata::from_files(package_name, files);
|
||||||
|
Ok(metadata)
|
||||||
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
let result = self
|
let result = self
|
||||||
.client
|
.client
|
||||||
|
@ -189,7 +211,7 @@ impl RegistryClient {
|
||||||
|
|
||||||
/// Fetch the metadata for a remote wheel file.
|
/// Fetch the metadata for a remote wheel file.
|
||||||
///
|
///
|
||||||
/// For a remote wheel, we try the following ways to fetch the metadata:
|
/// For a remote wheel, we try the following ways to fetch the metadata:
|
||||||
/// 1. From a [PEP 658](https://peps.python.org/pep-0658/) data-dist-info-metadata url
|
/// 1. From a [PEP 658](https://peps.python.org/pep-0658/) data-dist-info-metadata url
|
||||||
/// 2. From a remote wheel by partial zip reading
|
/// 2. From a remote wheel by partial zip reading
|
||||||
/// 3. From a (temp) download of a remote wheel (this is a fallback, the webserver should support range requests)
|
/// 3. From a (temp) download of a remote wheel (this is a fallback, the webserver should support range requests)
|
||||||
|
@ -239,7 +261,7 @@ impl RegistryClient {
|
||||||
if file
|
if file
|
||||||
.dist_info_metadata
|
.dist_info_metadata
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.is_some_and(pypi_types::Metadata::is_available)
|
.is_some_and(pypi_types::DistInfoMetadata::is_available)
|
||||||
{
|
{
|
||||||
let url = Url::parse(&format!("{}.metadata", file.url))?;
|
let url = Url::parse(&format!("{}.metadata", file.url))?;
|
||||||
|
|
||||||
|
@ -423,11 +445,11 @@ impl SimpleMetadata {
|
||||||
self.0.iter()
|
self.0.iter()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn from_package_simple_json(package_name: &PackageName, simple_json: SimpleJson) -> Self {
|
fn from_files(package_name: &PackageName, files: Vec<File>) -> Self {
|
||||||
let mut metadata = Self::default();
|
let mut metadata = Self::default();
|
||||||
|
|
||||||
// Group the distributions by version and kind
|
// Group the distributions by version and kind
|
||||||
for file in simple_json.files {
|
for file in files {
|
||||||
if let Some(filename) =
|
if let Some(filename) =
|
||||||
DistFilename::try_from_filename(file.filename.as_str(), package_name)
|
DistFilename::try_from_filename(file.filename.as_str(), package_name)
|
||||||
{
|
{
|
||||||
|
@ -461,3 +483,26 @@ impl IntoIterator for SimpleMetadata {
|
||||||
self.0.into_iter()
|
self.0.into_iter()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
enum MediaType {
|
||||||
|
Json,
|
||||||
|
Html,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MediaType {
|
||||||
|
/// Parse a media type from a string, returning `None` if the media type is not supported.
|
||||||
|
fn from_str(s: &str) -> Option<Self> {
|
||||||
|
match s {
|
||||||
|
"application/vnd.pypi.simple.v1+json" => Some(Self::Json),
|
||||||
|
"application/vnd.pypi.simple.v1+html" | "text/html" => Some(Self::Html),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the `Accept` header value for all supported media types.
|
||||||
|
#[inline]
|
||||||
|
const fn accepts() -> &'static str {
|
||||||
|
"application/vnd.pypi.simple.v1+json, application/vnd.pypi.simple.v1+html;q=0.2, text/html"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -2,7 +2,7 @@ pub use direct_url::{ArchiveInfo, DirInfo, DirectUrl, VcsInfo, VcsKind};
|
||||||
pub use index_url::{IndexUrl, IndexUrls};
|
pub use index_url::{IndexUrl, IndexUrls};
|
||||||
pub use lenient_requirement::LenientVersionSpecifiers;
|
pub use lenient_requirement::LenientVersionSpecifiers;
|
||||||
pub use metadata::{Error, Metadata21};
|
pub use metadata::{Error, Metadata21};
|
||||||
pub use simple_json::{File, Metadata, SimpleJson, Yanked};
|
pub use simple_json::{DistInfoMetadata, File, Hashes, SimpleJson, Yanked};
|
||||||
|
|
||||||
mod direct_url;
|
mod direct_url;
|
||||||
mod index_url;
|
mod index_url;
|
||||||
|
|
|
@ -18,7 +18,7 @@ pub struct SimpleJson {
|
||||||
pub struct File {
|
pub struct File {
|
||||||
// Not PEP 691 compliant alias used by pypi
|
// Not PEP 691 compliant alias used by pypi
|
||||||
#[serde(alias = "data_dist_info_metadata")]
|
#[serde(alias = "data_dist_info_metadata")]
|
||||||
pub dist_info_metadata: Option<Metadata>,
|
pub dist_info_metadata: Option<DistInfoMetadata>,
|
||||||
pub filename: String,
|
pub filename: String,
|
||||||
pub hashes: Hashes,
|
pub hashes: Hashes,
|
||||||
/// Note: Deserialized with [`LenientVersionSpecifiers`] since there are a number of invalid
|
/// Note: Deserialized with [`LenientVersionSpecifiers`] since there are a number of invalid
|
||||||
|
@ -47,12 +47,12 @@ where
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
#[serde(untagged)]
|
#[serde(untagged)]
|
||||||
pub enum Metadata {
|
pub enum DistInfoMetadata {
|
||||||
Bool(bool),
|
Bool(bool),
|
||||||
Hashes(Hashes),
|
Hashes(Hashes),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Metadata {
|
impl DistInfoMetadata {
|
||||||
pub fn is_available(&self) -> bool {
|
pub fn is_available(&self) -> bool {
|
||||||
match self {
|
match self {
|
||||||
Self::Bool(is_available) => *is_available,
|
Self::Bool(is_available) => *is_available,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue