From 5bce699ee176ba3eaa204704232fc63755a3b538 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Sun, 24 Dec 2023 11:04:00 -0500 Subject: [PATCH] Add support for HTML indexes (#719) ## Summary This PR adds support for HTML index responses (as with `--index-url=https://download.pytorch.org/whl`). Closes https://github.com/astral-sh/puffin/issues/412. --- Cargo.lock | 8 + Cargo.toml | 1 + .../distribution-filename/src/source_dist.rs | 3 +- crates/puffin-cli/tests/pip_compile.rs | 39 ++ crates/puffin-client/Cargo.toml | 4 +- crates/puffin-client/src/error.rs | 19 +- crates/puffin-client/src/html.rs | 332 ++++++++++++++++++ crates/puffin-client/src/lib.rs | 1 + crates/puffin-client/src/registry_client.rs | 69 +++- crates/pypi-types/src/lib.rs | 2 +- crates/pypi-types/src/simple_json.rs | 6 +- 11 files changed, 465 insertions(+), 19 deletions(-) create mode 100644 crates/puffin-client/src/html.rs diff --git a/Cargo.lock b/Cargo.lock index a39e9e2bb..142bef19d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2373,6 +2373,7 @@ dependencies = [ "futures", "http", "http-cache-semantics", + "insta", "install-wheel-rs", "pep440_rs 0.3.12", "pep508_rs", @@ -2389,6 +2390,7 @@ dependencies = [ "sha2", "tempfile", "thiserror", + "tl", "tokio", "tokio-util", "tracing", @@ -3701,6 +3703,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tl" +version = "0.7.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5e993a1c7c32fdf90a308cec4d457f507b2573acc909bd6e7a092321664fdb3" + [[package]] name = "tokio" version = "1.33.0" diff --git a/Cargo.toml b/Cargo.toml index 935397abf..170372d26 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -75,6 +75,7 @@ target-lexicon = { version = "0.12.12" } tempfile = { version = "3.8.1" } textwrap = { version = "0.15.2" } thiserror = { version = "1.0.50" } +tl = { version = "0.7.7" } tokio = { version = "1.33.0", features = ["rt-multi-thread"] } tokio-util = { version = "0.7.10", features = ["compat"] } toml = { version = "0.8.6" } diff --git a/crates/distribution-filename/src/source_dist.rs b/crates/distribution-filename/src/source_dist.rs index 10ffb5927..4db1d317a 100644 --- a/crates/distribution-filename/src/source_dist.rs +++ b/crates/distribution-filename/src/source_dist.rs @@ -8,7 +8,8 @@ use thiserror::Error; use pep440_rs::Version; use puffin_normalize::{InvalidNameError, PackageName}; -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum SourceDistExtension { Zip, TarGz, diff --git a/crates/puffin-cli/tests/pip_compile.rs b/crates/puffin-cli/tests/pip_compile.rs index 136061448..3a3b3c2cc 100644 --- a/crates/puffin-cli/tests/pip_compile.rs +++ b/crates/puffin-cli/tests/pip_compile.rs @@ -2701,3 +2701,42 @@ fn cache_errors_are_non_fatal() -> Result<()> { Ok(()) } } + +/// Resolve a distribution from an HTML-only registry. +#[test] +fn compile_html() -> Result<()> { + let temp_dir = TempDir::new()?; + let cache_dir = TempDir::new()?; + let venv = create_venv_py312(&temp_dir, &cache_dir); + + let requirements_in = temp_dir.child("requirements.in"); + requirements_in.write_str("jinja2<=3.1.2")?; + + insta::with_settings!({ + filters => INSTA_FILTERS.to_vec() + }, { + assert_cmd_snapshot!(Command::new(get_cargo_bin(BIN_NAME)) + .arg("pip-compile") + .arg("requirements.in") + .arg("--cache-dir") + .arg(cache_dir.path()) + .arg("--index-url") + .arg("https://download.pytorch.org/whl") + .env("VIRTUAL_ENV", venv.as_os_str()) + .current_dir(&temp_dir), @r###" + success: true + exit_code: 0 + ----- stdout ----- + # This file was autogenerated by Puffin v0.0.1 via the following command: + # puffin pip-compile requirements.in --cache-dir [CACHE_DIR] + jinja2==3.1.2 + markupsafe==2.1.3 + # via jinja2 + + ----- stderr ----- + Resolved 2 packages in [TIME] + "###); + }); + + Ok(()) +} diff --git a/crates/puffin-client/Cargo.toml b/crates/puffin-client/Cargo.toml index 73f77b4a8..0709d9662 100644 --- a/crates/puffin-client/Cargo.toml +++ b/crates/puffin-client/Cargo.toml @@ -4,7 +4,7 @@ version = "0.0.1" edition = "2021" [dependencies] -distribution-filename = { path = "../distribution-filename" } +distribution-filename = { path = "../distribution-filename", features = ["serde"] } distribution-types = { path = "../distribution-types" } install-wheel-rs = { path = "../install-wheel-rs" } pep440_rs = { path = "../pep440-rs" } @@ -28,6 +28,7 @@ serde_json = { workspace = true } sha2 = { workspace = true } tempfile = { workspace = true } thiserror = { workspace = true } +tl = { workspace = true } tokio = { workspace = true, features = ["fs"] } tokio-util = { workspace = true } tracing = { workspace = true } @@ -37,4 +38,5 @@ url = { workspace = true } pep508_rs = { path = "../pep508-rs" } anyhow = { workspace = true } +insta = { version = "1.34.0" } tokio = { workspace = true, features = ["fs", "macros"] } diff --git a/crates/puffin-client/src/error.rs b/crates/puffin-client/src/error.rs index dd3aa6bba..77d8eed14 100644 --- a/crates/puffin-client/src/error.rs +++ b/crates/puffin-client/src/error.rs @@ -5,6 +5,7 @@ use async_zip::error::ZipError; use thiserror::Error; use url::Url; +use crate::html; use distribution_filename::{WheelFilename, WheelFilenameError}; use puffin_normalize::PackageName; @@ -49,6 +50,9 @@ pub enum Error { #[error("Received some unexpected JSON from {url}")] BadJson { source: serde_json::Error, url: Url }, + #[error("Received some unexpected HTML from {url}")] + BadHtml { source: html::Error, url: Url }, + #[error(transparent)] AsyncHttpRangeReader(#[from] AsyncHttpRangeReaderError), @@ -82,10 +86,23 @@ pub enum Error { /// An [`io::Error`] with a filename attached #[error(transparent)] Persist(#[from] tempfile::PersistError), + + #[error("Missing `Content-Type` header for {0}")] + MissingContentType(Url), + + #[error("Invalid `Content-Type` header for {0}")] + InvalidContentTypeHeader(Url, #[source] http::header::ToStrError), + + #[error("Unsupported `Content-Type` \"{1}\" for {0}")] + UnsupportedMediaType(Url, String), } impl Error { - pub fn from_json_err(err: serde_json::Error, url: Url) -> Self { + pub(crate) fn from_json_err(err: serde_json::Error, url: Url) -> Self { Self::BadJson { source: err, url } } + + pub(crate) fn from_html_err(err: html::Error, url: Url) -> Self { + Self::BadHtml { source: err, url } + } } diff --git a/crates/puffin-client/src/html.rs b/crates/puffin-client/src/html.rs new file mode 100644 index 000000000..7133b3661 --- /dev/null +++ b/crates/puffin-client/src/html.rs @@ -0,0 +1,332 @@ +use std::str::FromStr; + +use tl::HTMLTag; +use url::Url; + +use pep440_rs::VersionSpecifiers; +use pypi_types::{DistInfoMetadata, File, Hashes, Yanked}; + +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error(transparent)] + Utf8(#[from] std::str::Utf8Error), + + #[error(transparent)] + UrlParse(#[from] url::ParseError), + + #[error("Missing href attribute on URL: {0}")] + MissingHref(Url), + + #[error("Expected distribution filename as last path component of URL: {0}")] + MissingFilename(Url), + + #[error("Missing hash attribute on URL: {0}")] + MissingHash(Url), + + #[error("Unexpected fragment (expected `#sha256=...`) on URL: {0}")] + FragmentParse(Url), + + #[error("Unsupported hash algorithm (expected `sha256`) on: {0}")] + UnsupportedHashAlgorithm(Url), + + #[error("Invalid `requires-python` specifier: {0}")] + Pep440(#[source] pep440_rs::Pep440Error), +} + +/// Parse the list of [`File`]s from the simple HTML page returned by the given URL. +pub(crate) fn parse_simple(text: &str, base: &Url) -> Result, Error> { + let dom = tl::parse(text, tl::ParserOptions::default()).unwrap(); + + // Parse the first `` tag, if any, to determine the base URL to which all + // relative URLs should be resolved. The HTML spec requires that the `` tag + // appear before other tags with attribute values of URLs. + let base = dom + .nodes() + .iter() + .filter_map(|node| node.as_tag()) + .take_while(|tag| !matches!(tag.name().as_bytes(), b"a" | b"link")) + .find(|tag| tag.name().as_bytes() == b"base") + .map(|base| parse_base(base)) + .transpose()? + .flatten() + .unwrap_or_else(|| base.clone()); + + // Parse each `` tag, to extract the filename, hash, and URL. + let files: Vec = dom + .nodes() + .iter() + .filter_map(|node| node.as_tag()) + .filter(|link| link.name().as_bytes() == b"a") + .map(|link| parse_anchor(link, &base)) + .collect::, _>>()?; + + Ok(files) +} + +/// Parse the `href` from a `` tag. +fn parse_base(base: &HTMLTag) -> Result, Error> { + let Some(Some(href)) = base.attributes().get("href") else { + return Ok(None); + }; + let href = std::str::from_utf8(href.as_bytes())?; + let url = Url::parse(href)?; + Ok(Some(url)) +} + +/// Parse the hash from a fragment, as in: `sha256=6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61` +fn parse_hash(fragment: &str, url: &Url) -> Result { + let mut parts = fragment.split('='); + + // Extract the key and value. + let name = parts + .next() + .ok_or_else(|| Error::FragmentParse(url.clone()))?; + let value = parts + .next() + .ok_or_else(|| Error::FragmentParse(url.clone()))?; + + // Ensure there are no more parts. + if parts.next().is_some() { + return Err(Error::FragmentParse(url.clone())); + } + + // TODO(charlie): Support all hash algorithms. + if name != "sha256" { + return Err(Error::UnsupportedHashAlgorithm(url.clone())); + } + + let sha256 = std::str::from_utf8(value.as_bytes())?; + let sha256 = sha256.to_string(); + Ok(Hashes { sha256 }) +} + +/// Parse a [`File`] from an `` tag. +fn parse_anchor(link: &HTMLTag, base: &Url) -> Result { + // Extract the href. + let href = link + .attributes() + .get("href") + .flatten() + .ok_or_else(|| Error::MissingHref(base.clone()))?; + let href = std::str::from_utf8(href.as_bytes())?; + let url = base.join(href)?; + + // Extract the filename from the body text, which MUST match that of + // the final path component of the URL. + let filename = url + .path_segments() + .and_then(|segments| segments.last()) + .ok_or_else(|| Error::MissingFilename(url.clone()))?; + + // Extract the hash, which should be in the fragment. + let hashes = url + .fragment() + .map(|fragment| parse_hash(fragment, &url)) + .transpose()? + .ok_or_else(|| Error::MissingHash(url.clone()))?; + + // Extract the `requires-python` field, which should be set on the + // `data-requires-python` attribute. + let requires_python = + if let Some(requires_python) = link.attributes().get("data-requires-python").flatten() { + let requires_python = std::str::from_utf8(requires_python.as_bytes())?; + let requires_python = + VersionSpecifiers::from_str(requires_python).map_err(Error::Pep440)?; + Some(requires_python) + } else { + None + }; + + // Extract the `data-dist-info-metadata` field, which should be set on + // the `data-dist-info-metadata` attribute. + let dist_info_metadata = if let Some(dist_info_metadata) = + link.attributes().get("data-dist-info-metadata").flatten() + { + let dist_info_metadata = std::str::from_utf8(dist_info_metadata.as_bytes())?; + match dist_info_metadata { + "true" => Some(DistInfoMetadata::Bool(true)), + "false" => Some(DistInfoMetadata::Bool(false)), + fragment => Some(DistInfoMetadata::Hashes(parse_hash(fragment, &url)?)), + } + } else { + None + }; + + // Extract the `yanked` field, which should be set on the `data-yanked` + // attribute. + let yanked = if let Some(yanked) = link.attributes().get("data-yanked").flatten() { + let yanked = std::str::from_utf8(yanked.as_bytes())?; + Some(Yanked::Reason(yanked.to_string())) + } else { + None + }; + + Ok(File { + dist_info_metadata, + yanked, + requires_python, + hashes, + filename: filename.to_string(), + // TODO(charlie): Store serialized URLs. + url: url.to_string(), + size: None, + upload_time: None, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_file() { + let text = r#" + + + +

Links for jinja2

+
Jinja2-3.1.2-py3-none-any.whl
+ + + + "#; + let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap(); + let result = parse_simple(text, &base).unwrap(); + insta::assert_debug_snapshot!(result, @r###" + [ + File { + dist_info_metadata: None, + filename: "Jinja2-3.1.2-py3-none-any.whl", + hashes: Hashes { + sha256: "6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61", + }, + requires_python: None, + size: None, + upload_time: None, + url: "https://download.pytorch.org/whl/Jinja2-3.1.2-py3-none-any.whl#sha256=6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61", + yanked: None, + }, + ] + "###); + } + + #[test] + fn parse_base() { + let text = r#" + + + + + + +

Links for jinja2

+ Jinja2-3.1.2-py3-none-any.whl
+ + + + "#; + let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap(); + let result = parse_simple(text, &base).unwrap(); + insta::assert_debug_snapshot!(result, @r###" + [ + File { + dist_info_metadata: None, + filename: "Jinja2-3.1.2-py3-none-any.whl", + hashes: Hashes { + sha256: "6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61", + }, + requires_python: None, + size: None, + upload_time: None, + url: "https://index.python.org/whl/Jinja2-3.1.2-py3-none-any.whl#sha256=6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61", + yanked: None, + }, + ] + "###); + } + + #[test] + fn parse_missing_href() { + let text = r#" + + + +

Links for jinja2

+ Jinja2-3.1.2-py3-none-any.whl
+ + + + "#; + let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap(); + let result = parse_simple(text, &base).unwrap_err(); + insta::assert_display_snapshot!(result, @"Missing href attribute on URL: https://download.pytorch.org/whl/jinja2/"); + } + + #[test] + fn parse_empty_href() { + let text = r#" + + + +

Links for jinja2

+ Jinja2-3.1.2-py3-none-any.whl
+ + + + "#; + let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap(); + let result = parse_simple(text, &base).unwrap_err(); + insta::assert_display_snapshot!(result, @"Missing hash attribute on URL: https://download.pytorch.org/whl/jinja2/"); + } + + #[test] + fn parse_missing_hash() { + let text = r#" + + + +

Links for jinja2

+ Jinja2-3.1.2-py3-none-any.whl
+ + + + "#; + let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap(); + let result = parse_simple(text, &base).unwrap_err(); + insta::assert_display_snapshot!(result, @"Missing hash attribute on URL: https://download.pytorch.org/whl/Jinja2-3.1.2-py3-none-any.whl"); + } + + #[test] + fn parse_missing_hash_value() { + let text = r#" + + + +

Links for jinja2

+ Jinja2-3.1.2-py3-none-any.whl
+ + + + "#; + let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap(); + let result = parse_simple(text, &base).unwrap_err(); + insta::assert_display_snapshot!(result, @"Unexpected fragment (expected `#sha256=...`) on URL: https://download.pytorch.org/whl/Jinja2-3.1.2-py3-none-any.whl#sha256"); + } + + #[test] + fn parse_unknown_hash() { + let text = r#" + + + +

Links for jinja2

+ Jinja2-3.1.2-py3-none-any.whl
+ + + + "#; + let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap(); + let result = parse_simple(text, &base).unwrap_err(); + insta::assert_display_snapshot!(result, @"Unsupported hash algorithm (expected `sha256`) on: https://download.pytorch.org/whl/Jinja2-3.1.2-py3-none-any.whl#sha512=6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"); + } +} diff --git a/crates/puffin-client/src/lib.rs b/crates/puffin-client/src/lib.rs index c002a2401..32d5e1120 100644 --- a/crates/puffin-client/src/lib.rs +++ b/crates/puffin-client/src/lib.rs @@ -6,5 +6,6 @@ pub use registry_client::{ mod cached_client; mod error; +mod html; mod registry_client; mod remote_metadata; diff --git a/crates/puffin-client/src/registry_client.rs b/crates/puffin-client/src/registry_client.rs index dcfc9172a..d66bc5079 100644 --- a/crates/puffin-client/src/registry_client.rs +++ b/crates/puffin-client/src/registry_client.rs @@ -25,7 +25,7 @@ use puffin_normalize::PackageName; use pypi_types::{File, IndexUrl, IndexUrls, Metadata21, SimpleJson}; use crate::remote_metadata::wheel_metadata_from_remote_zip; -use crate::{CachedClient, CachedClientError, Error}; +use crate::{html, CachedClient, CachedClientError, Error}; /// A builder for an [`RegistryClient`]. #[derive(Debug, Clone)] @@ -135,7 +135,6 @@ impl RegistryClient { let mut url: Url = index.clone().into(); url.path_segments_mut().unwrap().push(package_name.as_ref()); url.path_segments_mut().unwrap().push(""); - url.set_query(Some("format=application/vnd.pypi.simple.v1+json")); trace!("Fetching metadata for {} from {}", package_name, url); @@ -153,14 +152,37 @@ impl RegistryClient { .uncached() .get(url.clone()) .header("Accept-Encoding", "gzip") - .header("Accept", "application/vnd.pypi.simple.v1+json") + .header("Accept", MediaType::accepts()) .build()?; let parse_simple_response = |response: Response| async { - let bytes = response.bytes().await?; - let data: SimpleJson = serde_json::from_slice(bytes.as_ref()) - .map_err(|err| Error::from_json_err(err, url))?; - let metadata = SimpleMetadata::from_package_simple_json(package_name, data); - Ok(metadata) + let content_type = response + .headers() + .get("content-type") + .ok_or_else(|| Error::MissingContentType(url.clone()))?; + let content_type = content_type + .to_str() + .map_err(|err| Error::InvalidContentTypeHeader(url.clone(), err))?; + let media_type = content_type.split(';').next().unwrap_or(content_type); + let media_type = MediaType::from_str(media_type).ok_or_else(|| { + Error::UnsupportedMediaType(url.clone(), media_type.to_string()) + })?; + + match media_type { + MediaType::Json => { + let bytes = response.bytes().await?; + let data: SimpleJson = serde_json::from_slice(bytes.as_ref()) + .map_err(|err| Error::from_json_err(err, url.clone()))?; + let metadata = SimpleMetadata::from_files(package_name, data.files); + Ok(metadata) + } + MediaType::Html => { + let text = response.text().await?; + let files = html::parse_simple(&text, &url) + .map_err(|err| Error::from_html_err(err, url.clone()))?; + let metadata = SimpleMetadata::from_files(package_name, files); + Ok(metadata) + } + } }; let result = self .client @@ -189,7 +211,7 @@ impl RegistryClient { /// Fetch the metadata for a remote wheel file. /// - /// For a remote wheel, we try the following ways to fetch the metadata: + /// For a remote wheel, we try the following ways to fetch the metadata: /// 1. From a [PEP 658](https://peps.python.org/pep-0658/) data-dist-info-metadata url /// 2. From a remote wheel by partial zip reading /// 3. From a (temp) download of a remote wheel (this is a fallback, the webserver should support range requests) @@ -239,7 +261,7 @@ impl RegistryClient { if file .dist_info_metadata .as_ref() - .is_some_and(pypi_types::Metadata::is_available) + .is_some_and(pypi_types::DistInfoMetadata::is_available) { let url = Url::parse(&format!("{}.metadata", file.url))?; @@ -423,11 +445,11 @@ impl SimpleMetadata { self.0.iter() } - fn from_package_simple_json(package_name: &PackageName, simple_json: SimpleJson) -> Self { + fn from_files(package_name: &PackageName, files: Vec) -> Self { let mut metadata = Self::default(); // Group the distributions by version and kind - for file in simple_json.files { + for file in files { if let Some(filename) = DistFilename::try_from_filename(file.filename.as_str(), package_name) { @@ -461,3 +483,26 @@ impl IntoIterator for SimpleMetadata { self.0.into_iter() } } + +#[derive(Debug)] +enum MediaType { + Json, + Html, +} + +impl MediaType { + /// Parse a media type from a string, returning `None` if the media type is not supported. + fn from_str(s: &str) -> Option { + match s { + "application/vnd.pypi.simple.v1+json" => Some(Self::Json), + "application/vnd.pypi.simple.v1+html" | "text/html" => Some(Self::Html), + _ => None, + } + } + + /// Return the `Accept` header value for all supported media types. + #[inline] + const fn accepts() -> &'static str { + "application/vnd.pypi.simple.v1+json, application/vnd.pypi.simple.v1+html;q=0.2, text/html" + } +} diff --git a/crates/pypi-types/src/lib.rs b/crates/pypi-types/src/lib.rs index da5940d0b..424de3b68 100644 --- a/crates/pypi-types/src/lib.rs +++ b/crates/pypi-types/src/lib.rs @@ -2,7 +2,7 @@ pub use direct_url::{ArchiveInfo, DirInfo, DirectUrl, VcsInfo, VcsKind}; pub use index_url::{IndexUrl, IndexUrls}; pub use lenient_requirement::LenientVersionSpecifiers; pub use metadata::{Error, Metadata21}; -pub use simple_json::{File, Metadata, SimpleJson, Yanked}; +pub use simple_json::{DistInfoMetadata, File, Hashes, SimpleJson, Yanked}; mod direct_url; mod index_url; diff --git a/crates/pypi-types/src/simple_json.rs b/crates/pypi-types/src/simple_json.rs index 3aed134dc..4c12a172c 100644 --- a/crates/pypi-types/src/simple_json.rs +++ b/crates/pypi-types/src/simple_json.rs @@ -18,7 +18,7 @@ pub struct SimpleJson { pub struct File { // Not PEP 691 compliant alias used by pypi #[serde(alias = "data_dist_info_metadata")] - pub dist_info_metadata: Option, + pub dist_info_metadata: Option, pub filename: String, pub hashes: Hashes, /// Note: Deserialized with [`LenientVersionSpecifiers`] since there are a number of invalid @@ -47,12 +47,12 @@ where #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(untagged)] -pub enum Metadata { +pub enum DistInfoMetadata { Bool(bool), Hashes(Hashes), } -impl Metadata { +impl DistInfoMetadata { pub fn is_available(&self) -> bool { match self { Self::Bool(is_available) => *is_available,