URL-decode hashes in HTML fragments (#3655)

## Summary

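URL-decode the fragment of each `href` before parsing hashes from it, so that percent-encoded fragments such as `#sha256%3D<digest>` are handled the same way as `#sha256=<digest>`.

Closes https://github.com/astral-sh/uv/issues/3654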
This commit is contained in:
Charlie Marsh 2024-05-18 22:19:55 -04:00 committed by GitHub
parent 53c2551fac
commit 963f2a778b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -145,12 +145,14 @@ impl SimpleHtml {
// Extract the hash, which should be in the fragment.
let decoded = html_escape::decode_html_entities(href);
let (path, hashes) = if let Some((path, fragment)) = decoded.split_once('#') {
let fragment = urlencoding::decode(fragment)
.map_err(|_| Error::FragmentParse(fragment.to_string()))?;
(
path,
if fragment.trim().is_empty() {
Hashes::default()
} else {
Self::parse_hash(fragment)?
Self::parse_hash(&fragment)?
},
)
} else {
@ -488,6 +490,62 @@ mod tests {
"###);
}
/// Checks that a hash fragment whose `=` is percent-encoded (`sha256%3D<digest>`)
/// is still parsed into the file's `sha256` hash.
#[test]
fn parse_encoded_fragment() {
// Index page with a single link; its fragment is written as `sha256%3D...`
// instead of the plain `sha256=...` form.
let text = r#"
<!DOCTYPE html>
<html>
<body>
<h1>Links for jinja2</h1>
<a href="/whl/Jinja2-3.1.2-py3-none-any.whl#sha256%3D4095ada29e51070f7d199a0a5bdf5c8d8e238e03f0bf4dcc02571e78c9ae800d">Jinja2-3.1.2-py3-none-any.whl</a><br/>
</body>
</html>
<!--TIMESTAMP 1703347410-->
"#;
let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap();
let result = SimpleHtml::parse(text, &base).unwrap();
// Expected outcome: the digest appears under `hashes.sha256`, while `url`
// keeps the fragment exactly as written in the `href` (still percent-encoded).
insta::assert_debug_snapshot!(result, @r###"
SimpleHtml {
base: BaseUrl(
Url {
scheme: "https",
cannot_be_a_base: false,
username: "",
password: None,
host: Some(
Domain(
"download.pytorch.org",
),
),
port: None,
path: "/whl/jinja2/",
query: None,
fragment: None,
},
),
files: [
File {
dist_info_metadata: None,
filename: "Jinja2-3.1.2-py3-none-any.whl",
hashes: Hashes {
md5: None,
sha256: Some(
"4095ada29e51070f7d199a0a5bdf5c8d8e238e03f0bf4dcc02571e78c9ae800d",
),
sha384: None,
sha512: None,
},
requires_python: None,
size: None,
upload_time: None,
url: "/whl/Jinja2-3.1.2-py3-none-any.whl#sha256%3D4095ada29e51070f7d199a0a5bdf5c8d8e238e03f0bf4dcc02571e78c9ae800d",
yanked: None,
},
],
}
"###);
}
#[test]
fn parse_quoted_filepath() {
let text = r#"