Make hashes optional (#910)

There is no guarantee that indexes provide hashes at all or the sha256
we support specifically. [PEP
503](https://peps.python.org/pep-0503/#specification):

> The URL SHOULD include a hash in the form of a URL fragment with the
following syntax: #<hashname>=<hashvalue>, where <hashname> is the
lowercase name of the hash function (such as sha256) and <hashvalue> is
the hex encoded digest.

We instead use the url as input to generate a hash when caching.
This commit is contained in:
konsti 2024-01-14 22:32:55 +01:00 committed by GitHub
parent 9ad19b7e54
commit 5ffbfadf66
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 104 additions and 41 deletions

2
Cargo.lock generated
View file

@ -848,6 +848,7 @@ dependencies = [
"anyhow",
"cache-key",
"chrono",
"data-encoding",
"distribution-filename",
"fs-err",
"once_cell",
@ -860,6 +861,7 @@ dependencies = [
"rustc-hash",
"serde",
"serde_json",
"sha2",
"thiserror",
"url",
]

View file

@ -24,10 +24,12 @@ requirements-txt = { path = "../requirements-txt" }
anyhow = { workspace = true }
chrono = { workspace = true, features = ["serde"] }
data-encoding = { workspace = true }
fs-err = { workspace = true }
once_cell = { workspace = true }
rustc-hash = { workspace = true }
serde = { workspace = true, features = ["derive"] }
serde_json = { workspace = true }
sha2 = { workspace = true }
thiserror = { workspace = true }
url = { workspace = true }

View file

@ -18,6 +18,12 @@ impl DistributionId {
}
}
impl DistributionId {
pub fn as_str(&self) -> &str {
&self.0
}
}
/// A unique identifier for a resource, like a URL or a Git repository.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct ResourceId(String);

View file

@ -652,11 +652,19 @@ impl Identifier for Url {
impl Identifier for File {
fn distribution_id(&self) -> DistributionId {
DistributionId::new(self.hashes.sha256.clone())
if let Some(hash) = &self.hashes.sha256 {
DistributionId::new(hash)
} else {
self.url.distribution_id()
}
}
fn resource_id(&self) -> ResourceId {
ResourceId::new(self.hashes.sha256.clone())
if let Some(hash) = &self.hashes.sha256 {
ResourceId::new(hash)
} else {
self.url.resource_id()
}
}
}

View file

@ -82,7 +82,9 @@ impl SimpleHtml {
let sha256 = std::str::from_utf8(value.as_bytes())?;
let sha256 = sha256.to_string();
Ok(Hashes { sha256 })
Ok(Hashes {
sha256: Some(sha256),
})
}
/// Parse a [`File`] from an `<a>` tag.
@ -96,10 +98,12 @@ impl SimpleHtml {
.ok_or(Error::MissingHref)?;
let href = std::str::from_utf8(href.as_bytes())?;
// Split the base and the fragment.
let (path, fragment) = href
.split_once('#')
.ok_or_else(|| Error::MissingHash(href.to_string()))?;
let (path, hashes) = if let Some((path, fragment)) = href.split_once('#') {
// Extract the hash, which should be in the fragment.
(path, Self::parse_hash(fragment)?)
} else {
(href, Hashes::default())
};
// Extract the filename from the body text, which MUST match that of
// the final path component of the URL.
@ -108,9 +112,6 @@ impl SimpleHtml {
.last()
.ok_or_else(|| Error::MissingFilename(href.to_string()))?;
// Extract the hash, which should be in the fragment.
let hashes = Self::parse_hash(fragment)?;
// Extract the `requires-python` field, which should be set on the
// `data-requires-python` attribute.
let requires_python = if let Some(requires_python) =
@ -234,7 +235,9 @@ mod tests {
dist_info_metadata: None,
filename: "Jinja2-3.1.2-py3-none-any.whl",
hashes: Hashes {
sha256: "6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61",
sha256: Some(
"6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61",
),
},
requires_python: None,
size: None,
@ -288,7 +291,9 @@ mod tests {
dist_info_metadata: None,
filename: "Jinja2-3.1.2-py3-none-any.whl",
hashes: Hashes {
sha256: "6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61",
sha256: Some(
"6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61",
),
},
requires_python: None,
size: None,
@ -301,6 +306,57 @@ mod tests {
"###);
}
#[test]
fn parse_missing_hash() {
let text = r#"
<!DOCTYPE html>
<html>
<body>
<h1>Links for jinja2</h1>
<a href="/whl/Jinja2-3.1.2-py3-none-any.whl">Jinja2-3.1.2-py3-none-any.whl</a><br/>
</body>
</html>
<!--TIMESTAMP 1703347410-->
"#;
let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap();
let result = SimpleHtml::parse(text, &base).unwrap();
insta::assert_debug_snapshot!(result, @r###"
SimpleHtml {
base: BaseUrl(
Url {
scheme: "https",
cannot_be_a_base: false,
username: "",
password: None,
host: Some(
Domain(
"download.pytorch.org",
),
),
port: None,
path: "/whl/jinja2/",
query: None,
fragment: None,
},
),
files: [
File {
dist_info_metadata: None,
filename: "Jinja2-3.1.2-py3-none-any.whl",
hashes: Hashes {
sha256: None,
},
requires_python: None,
size: None,
upload_time: None,
url: "/whl/Jinja2-3.1.2-py3-none-any.whl",
yanked: None,
},
],
}
"###);
}
#[test]
fn parse_missing_href() {
let text = r#"
@ -335,23 +391,6 @@ mod tests {
insta::assert_display_snapshot!(result, @"Missing href attribute on anchor link");
}
#[test]
fn parse_missing_hash() {
let text = r#"
<!DOCTYPE html>
<html>
<body>
<h1>Links for jinja2</h1>
<a href="/whl/Jinja2-3.1.2-py3-none-any.whl">Jinja2-3.1.2-py3-none-any.whl</a><br/>
</body>
</html>
<!--TIMESTAMP 1703347410-->
"#;
let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap();
let result = SimpleHtml::parse(text, &base).unwrap_err();
insta::assert_display_snapshot!(result, @"Missing hash attribute on URL: /whl/Jinja2-3.1.2-py3-none-any.whl");
}
#[test]
fn parse_missing_hash_value() {
let text = r#"

View file

@ -16,8 +16,8 @@ use zip::ZipArchive;
use distribution_filename::WheelFilename;
use distribution_types::{
DirectArchiveUrl, DirectGitUrl, Dist, GitSourceDist, LocalEditable, Name, PathSourceDist,
RemoteSource, SourceDist,
DirectArchiveUrl, DirectGitUrl, Dist, GitSourceDist, Identifier, LocalEditable, Name,
PathSourceDist, RemoteSource, SourceDist,
};
use install_wheel_rs::read_dist_info;
use platform_tags::Tags;
@ -104,7 +104,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
CacheBucket::BuiltWheels,
WheelCache::Index(&registry_source_dist.index)
.remote_wheel_dir(registry_source_dist.name.as_ref())
.join(&registry_source_dist.file.hashes.sha256[..16]),
.join(&registry_source_dist.file.distribution_id().as_str()[..16]),
);
self.url(
@ -160,7 +160,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
CacheBucket::BuiltWheels,
WheelCache::Index(&registry_source_dist.index)
.remote_wheel_dir(registry_source_dist.name.as_ref())
.join(&registry_source_dist.file.hashes.sha256[..16]),
.join(&registry_source_dist.file.distribution_id().as_str()[..16]),
);
self.url_metadata(

View file

@ -270,8 +270,10 @@ impl std::fmt::Display for DisplayResolutionGraph<'_> {
.filter(|hashes| !hashes.is_empty())
{
for hash in hashes {
writeln!(f, " \\")?;
write!(f, " --hash={hash}")?;
if let Some(hash) = hash.to_string() {
writeln!(f, " \\")?;
write!(f, " --hash={hash}")?;
}
}
}
}

View file

@ -85,14 +85,18 @@ impl Yanked {
///
/// PEP 691 says multiple hashes can be included and the interpretation is left to the client, we
/// only support SHA 256 atm.
#[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq, Hash, Serialize, Deserialize)]
#[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq, Hash, Default, Serialize, Deserialize)]
pub struct Hashes {
// TODO(charlie): Hashes should be optional.
pub sha256: String,
pub sha256: Option<String>,
}
impl std::fmt::Display for Hashes {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "sha256:{}", self.sha256)
impl Hashes {
/// Format as `<algorithm>:<hash>`.
///
/// Currently limited to SHA256.
pub fn to_string(&self) -> Option<String> {
self.sha256
.as_ref()
.map(|sha256| format!("sha256:{sha256}"))
}
}