Deduplicate various .dist-info/METADATA read implementations (#531)

Closes https://github.com/astral-sh/puffin/issues/484.
This commit is contained in:
Charlie Marsh 2023-12-03 21:29:00 -05:00 committed by GitHub
parent fa3107b173
commit 3b55d0b295
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 105 additions and 164 deletions

View file

@ -1,14 +1,16 @@
//! Takes a wheel and installs it into a venv..
use std::io;
use std::io::{Read, Seek};
use std::path::PathBuf;
use std::str::FromStr;
use distribution_filename::WheelFilename;
use platform_info::PlatformInfoError;
use thiserror::Error;
use zip::result::ZipError;
use zip::ZipArchive;
use distribution_filename::WheelFilename;
pub use install_location::{normalize_name, InstallLocation, LockedDir};
use pep440_rs::Version;
use platform_host::{Arch, Os};
@ -80,8 +82,12 @@ pub enum Error {
MissingDistInfo,
#[error("Multiple .dist-info directories found: {0}")]
MultipleDistInfo(String),
#[error("Invalid wheel size")]
InvalidSize,
}
/// Find the `dist-info` directory from a list of files.
///
/// The metadata name may be uppercase, while the wheel and dist info names are lowercase, or
/// the metadata name and the dist info name are lowercase, while the wheel name is uppercase.
/// Either way, we just search the wheel for the name.
@ -124,12 +130,32 @@ pub fn find_dist_info<'a, T: Copy>(
Ok((payload, dist_info_dir))
}
/// Given an archive, read the `dist-info` metadata into a buffer.
pub fn read_dist_info(
filename: &WheelFilename,
archive: &mut ZipArchive<impl Read + Seek + Sized>,
) -> Result<Vec<u8>, Error> {
let dist_info_dir = find_dist_info(filename, archive.file_names().map(|name| (name, name)))?.1;
let mut file = archive
.by_name(&format!("{dist_info_dir}/METADATA"))
.map_err(|err| Error::Zip(filename.to_string(), err))?;
#[allow(clippy::cast_possible_truncation)]
let mut buffer = Vec::with_capacity(file.size() as usize);
file.read_to_end(&mut buffer)?;
Ok(buffer)
}
#[cfg(test)]
mod test {
use crate::find_dist_info;
use distribution_filename::WheelFilename;
use std::str::FromStr;
use distribution_filename::WheelFilename;
use crate::find_dist_info;
#[test]
fn test_dot_in_name() {
let files = [

View file

@ -6,14 +6,13 @@ use std::path::Path;
use configparser::ini::Ini;
use fs_err as fs;
use fs_err::File;
use mailparse::MailHeaderMap;
use tracing::{debug, span, Level};
use pypi_types::DirectUrl;
use crate::install_location::InstallLocation;
use crate::wheel::{
extra_dist_info, install_data, parse_wheel_version, read_scripts_from_section,
extra_dist_info, install_data, parse_metadata, parse_wheel_version, read_scripts_from_section,
write_script_entrypoints,
};
use crate::{read_record_file, Error, Script};
@ -49,7 +48,8 @@ pub fn install_wheel(
};
let dist_info_prefix = find_dist_info(&wheel)?;
let (name, _version) = read_metadata(&dist_info_prefix, &wheel)?;
let metadata = dist_info_metadata(&dist_info_prefix, &wheel)?;
let (name, _version) = parse_metadata(&dist_info_prefix, &metadata)?;
let _my_span = span!(Level::DEBUG, "install_wheel", name);
@ -128,11 +128,9 @@ pub fn install_wheel(
Ok(())
}
/// The metadata name may be uppercase, while the wheel and dist info names are lowercase, or
/// the metadata name and the dist info name are lowercase, while the wheel name is uppercase.
/// Either way, we just search the wheel for the name
/// Find the `dist-info` directory in an unzipped wheel.
///
/// <https://github.com/PyO3/python-pkginfo-rs>
/// See: <https://github.com/PyO3/python-pkginfo-rs>
fn find_dist_info(path: impl AsRef<Path>) -> Result<String, Error> {
// Iterate over `path` to find the `.dist-info` directory. It should be at the top-level.
let Some(dist_info) = fs::read_dir(path.as_ref())?.find_map(|entry| {
@ -162,52 +160,12 @@ fn find_dist_info(path: impl AsRef<Path>) -> Result<String, Error> {
Ok(dist_info_prefix.to_string_lossy().to_string())
}
/// <https://github.com/PyO3/python-pkginfo-rs>
fn read_metadata(
dist_info_prefix: &str,
wheel: impl AsRef<Path>,
) -> Result<(String, String), Error> {
/// Read the `dist-info` metadata from a directory.
fn dist_info_metadata(dist_info_prefix: &str, wheel: impl AsRef<Path>) -> Result<Vec<u8>, Error> {
let metadata_file = wheel
.as_ref()
.join(format!("{dist_info_prefix}.dist-info/METADATA"));
let content = fs::read(&metadata_file)?;
// HACK: trick mailparse to parse as UTF-8 instead of ASCII
let mut mail = b"Content-Type: text/plain; charset=utf-8\n".to_vec();
mail.extend_from_slice(&content);
let msg = mailparse::parse_mail(&mail).map_err(|err| {
Error::InvalidWheel(format!("Invalid {}: {}", metadata_file.display(), err))
})?;
let headers = msg.get_headers();
let metadata_version =
headers
.get_first_value("Metadata-Version")
.ok_or(Error::InvalidWheel(format!(
"No Metadata-Version field in {}",
metadata_file.display()
)))?;
// Crude but it should do https://packaging.python.org/en/latest/specifications/core-metadata/#metadata-version
// At time of writing:
// > Version of the file format; legal values are “1.0”, “1.1”, “1.2”, “2.1”, “2.2”, and “2.3”.
if !(metadata_version.starts_with("1.") || metadata_version.starts_with("2.")) {
return Err(Error::InvalidWheel(format!(
"Metadata-Version field has unsupported value {metadata_version}"
)));
}
let name = headers
.get_first_value("Name")
.ok_or(Error::InvalidWheel(format!(
"No Name field in {}",
metadata_file.display()
)))?;
let version = headers
.get_first_value("Version")
.ok_or(Error::InvalidWheel(format!(
"No Version field in {}",
metadata_file.display()
)))?;
Ok((name, version))
Ok(fs::read(metadata_file)?)
}
/// Parses the `entry_points.txt` entry in the wheel for console scripts

View file

@ -901,7 +901,7 @@ pub fn install_wheel(
compile: bool,
check_hashes: bool,
// initially used to the console scripts, currently unused. Keeping it because we likely need
// it for validation later
// it for validation later.
_extras: &[String],
sys_executable: impl AsRef<Path>,
) -> Result<String, Error> {
@ -936,8 +936,8 @@ pub fn install_wheel(
let dist_info_prefix = find_dist_info(filename, archive.file_names().map(|name| (name, name)))?
.1
.to_string();
let (name, _version) = read_metadata(&dist_info_prefix, &mut archive)?;
// TODO: Check that name and version match
let metadata = dist_info_metadata(&dist_info_prefix, &mut archive)?;
let (name, _version) = parse_metadata(&dist_info_prefix, &metadata)?;
let record_path = format!("{dist_info_prefix}.dist-info/RECORD");
let mut record = read_record_file(&mut archive.by_name(&record_path).map_err(|err| {
@ -1037,49 +1037,59 @@ pub fn install_wheel(
Ok(filename.get_tag())
}
/// <https://github.com/PyO3/python-pkginfo-rs>
fn read_metadata(
/// Read the `dist-info` metadata from a wheel archive.
fn dist_info_metadata(
dist_info_prefix: &str,
archive: &mut ZipArchive<impl Read + Seek + Sized>,
) -> Result<(String, String), Error> {
) -> Result<Vec<u8>, Error> {
let mut content = Vec::new();
let metadata_file = format!("{dist_info_prefix}.dist-info/METADATA");
let dist_info_file = format!("{dist_info_prefix}.dist-info/DIST-INFO");
archive
.by_name(&metadata_file)
.map_err(|err| {
let file = metadata_file.to_string();
Error::Zip(file, err)
})?
.by_name(&dist_info_file)
.map_err(|err| Error::Zip(dist_info_file.clone(), err))?
.read_to_end(&mut content)?;
Ok(content)
}
/// Parse the distribution name and version from a wheel's `dist-info` metadata.
///
/// See: <https://github.com/PyO3/python-pkginfo-rs>
pub(crate) fn parse_metadata(
dist_info_prefix: &str,
content: &[u8],
) -> Result<(String, String), Error> {
// HACK: trick mailparse to parse as UTF-8 instead of ASCII
let mut mail = b"Content-Type: text/plain; charset=utf-8\n".to_vec();
mail.extend_from_slice(&content);
let msg = mailparse::parse_mail(&mail)
.map_err(|err| Error::InvalidWheel(format!("Invalid {metadata_file}: {err}")))?;
mail.extend_from_slice(content);
let msg = mailparse::parse_mail(&mail).map_err(|err| {
Error::InvalidWheel(format!(
"Invalid metadata in {dist_info_prefix}.dist-info/METADATA: {err}"
))
})?;
let headers = msg.get_headers();
let metadata_version =
headers
.get_first_value("Metadata-Version")
.ok_or(Error::InvalidWheel(format!(
"No Metadata-Version field in {metadata_file}"
"No `Metadata-Version` field in: {dist_info_prefix}.dist-info/METADATA"
)))?;
// Crude but it should do https://packaging.python.org/en/latest/specifications/core-metadata/#metadata-version
// At time of writing:
// > Version of the file format; legal values are “1.0”, “1.1”, “1.2”, “2.1”, “2.2”, and “2.3”.
if !(metadata_version.starts_with("1.") || metadata_version.starts_with("2.")) {
return Err(Error::InvalidWheel(format!(
"Metadata-Version field has unsupported value {metadata_version}"
"`Metadata-Version` field has unsupported value {metadata_version} in: {dist_info_prefix}.dist-info/METADATA"
)));
}
let name = headers
.get_first_value("Name")
.ok_or(Error::InvalidWheel(format!(
"No Name field in {metadata_file}"
"No `Name` field in: {dist_info_prefix}.dist-info/METADATA"
)))?;
let version = headers
.get_first_value("Version")
.ok_or(Error::InvalidWheel(format!(
"No Version field in {metadata_file}"
"No `Version` field in: {dist_info_prefix}.dist-info/METADATA"
)))?;
Ok((name, version))
}

View file

@ -5,7 +5,7 @@ use zip::ZipArchive;
use distribution_filename::WheelFilename;
use distribution_types::{Dist, SourceDist};
use install_wheel_rs::find_dist_info;
use install_wheel_rs::read_dist_info;
use pypi_types::Metadata21;
use crate::error::Error;
@ -106,68 +106,14 @@ impl std::fmt::Display for SourceDistDownload {
}
}
impl InMemoryWheel {
/// Read the [`Metadata21`] from a wheel.
pub fn read_dist_info(&self) -> Result<Metadata21, Error> {
let mut archive = ZipArchive::new(std::io::Cursor::new(&self.buffer))?;
let dist_info_dir = find_dist_info(
&self.filename,
archive.file_names().map(|name| (name, name)),
)
.map_err(|err| {
Error::DistInfo(Box::new(self.filename.clone()), self.dist.to_string(), err)
})?
.1;
let dist_info =
std::io::read_to_string(archive.by_name(&format!("{dist_info_dir}/METADATA"))?)?;
Ok(Metadata21::parse(dist_info.as_bytes())?)
}
}
impl DiskWheel {
/// Read the [`Metadata21`] from a wheel.
pub fn read_dist_info(&self) -> Result<Metadata21, Error> {
let mut archive = ZipArchive::new(fs_err::File::open(&self.path)?)?;
let dist_info_dir = find_dist_info(
&self.filename,
archive.file_names().map(|name| (name, name)),
)
.map_err(|err| {
Error::DistInfo(Box::new(self.filename.clone()), self.dist.to_string(), err)
})?
.1;
let dist_info =
std::io::read_to_string(archive.by_name(&format!("{dist_info_dir}/METADATA"))?)?;
Ok(Metadata21::parse(dist_info.as_bytes())?)
}
}
impl BuiltWheel {
/// Read the [`Metadata21`] from a wheel.
pub fn read_dist_info(&self) -> Result<Metadata21, Error> {
let mut archive = ZipArchive::new(fs_err::File::open(&self.path)?)?;
let dist_info_dir = find_dist_info(
&self.filename,
archive.file_names().map(|name| (name, name)),
)
.map_err(|err| {
let dist_info = read_dist_info(&self.filename, &mut archive).map_err(|err| {
Error::DistInfo(Box::new(self.filename.clone()), self.dist.to_string(), err)
})?
.1;
let dist_info =
std::io::read_to_string(archive.by_name(&format!("{dist_info_dir}/METADATA"))?)?;
Ok(Metadata21::parse(dist_info.as_bytes())?)
}
}
impl LocalWheel {
/// Read the [`Metadata21`] from a wheel.
pub fn read_dist_info(&self) -> Result<Metadata21, Error> {
match self {
LocalWheel::InMemory(wheel) => wheel.read_dist_info(),
LocalWheel::Disk(wheel) => wheel.read_dist_info(),
LocalWheel::Built(wheel) => wheel.read_dist_info(),
}
})?;
Ok(Metadata21::parse(&dist_info)?)
}
}

View file

@ -4,7 +4,7 @@ use std::path::{Path, PathBuf};
use std::str::FromStr;
use std::sync::Arc;
use anyhow::bail;
use anyhow::Result;
use fs_err::tokio as fs;
use futures::TryStreamExt;
use fxhash::FxHashMap;
@ -16,12 +16,13 @@ use tokio::task::JoinError;
use tokio_util::compat::FuturesAsyncReadCompatExt;
use tracing::{debug, info_span};
use url::Url;
use zip::result::ZipError;
use zip::ZipArchive;
use distribution_filename::{WheelFilename, WheelFilenameError};
use distribution_types::direct_url::{DirectArchiveUrl, DirectGitUrl};
use distribution_types::{Dist, GitSourceDist, Identifier, RemoteSource, SourceDist};
use install_wheel_rs::find_dist_info;
use distribution_types::{GitSourceDist, Identifier, RemoteSource, SourceDist};
use install_wheel_rs::read_dist_info;
use platform_tags::Tags;
use puffin_cache::{digest, CacheBucket, CacheEntry, CanonicalUrl, WheelCache};
use puffin_client::{CachedClient, CachedClientError, DataWithCachePolicy};
@ -30,13 +31,15 @@ use puffin_normalize::PackageName;
use puffin_traits::BuildContext;
use pypi_types::Metadata21;
use crate::download::BuiltWheel;
use crate::locks::LockedFile;
use crate::{Download, Reporter, SourceDistDownload};
/// The caller is responsible for adding the source dist information to the error chain
#[derive(Debug, Error)]
pub enum SourceDistError {
#[error("Building source distributions is disabled")]
NoBuild,
// Network error
#[error("Failed to parse URL: `{0}`")]
UrlParse(String, #[source] url::ParseError),
@ -54,7 +57,7 @@ pub enum SourceDistError {
Serde(#[from] serde_json::Error),
// Build error
#[error("Failed to build {0}")]
#[error("Failed to build: {0}")]
Build(Box<SourceDist>, #[source] anyhow::Error),
#[error("Built wheel has an invalid filename")]
WheelFilename(#[from] WheelFilenameError),
@ -64,9 +67,13 @@ pub enum SourceDistError {
metadata: PackageName,
},
#[error("Failed to parse metadata from built wheel")]
Metadata(#[from] crate::error::Error),
Metadata(#[from] pypi_types::Error),
#[error("Failed to read `dist-info` metadata from built wheel")]
DistInfo(#[from] install_wheel_rs::Error),
#[error("Failed to read zip archive from built wheel")]
Zip(#[from] ZipError),
/// Should not occur, i've only seen it when another task panicked
/// Should not occur; only seen when another task panicked.
#[error("The task executor is broken, did some other task panic?")]
Join(#[from] JoinError),
}
@ -192,18 +199,11 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
// Read the metadata from the wheel.
let filename = WheelFilename::from_str(&disk_filename)?;
// TODO(konstin): Remove duplicated `.dist-info` read.
// See: https://github.com/astral-sh/puffin/issues/484
let metadata = BuiltWheel {
dist: Dist::Source(source_dist.clone()),
filename: filename.clone(),
path: wheel_dir.join(&disk_filename),
}
.read_dist_info()?;
let path = wheel_dir.join(disk_filename);
let metadata = read_metadata(&filename, &path)?;
BuiltWheelMetadata {
path: wheel_dir.join(disk_filename),
path,
filename,
metadata,
}
@ -263,8 +263,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
download.subdirectory.as_deref(),
&cache_entry,
)
.await
.map_err(|err| SourceDistError::Build(Box::new(source_dist.clone()), err))?;
.await?;
if let Some(task) = task {
if let Some(reporter) = self.reporter.as_ref() {
@ -328,8 +327,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
subdirectory,
&cache_entry,
)
.await
.map_err(|err| SourceDistError::Build(Box::new(source_dist.clone()), err))?;
.await?;
if let Some(task) = task {
if let Some(reporter) = self.reporter.as_ref() {
reporter.on_build_complete(source_dist, task);
@ -415,8 +413,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
subdirectory.as_deref(),
&cache_entry,
)
.await
.map_err(|err| SourceDistError::Build(Box::new(source_dist.clone()), err))?;
.await?;
if metadata.name != git_source_dist.name {
return Err(SourceDistError::NameMismatch {
@ -507,11 +504,11 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
source_dist: &Path,
subdirectory: Option<&Path>,
cache_entry: &CacheEntry,
) -> anyhow::Result<(String, WheelFilename, Metadata21)> {
) -> Result<(String, WheelFilename, Metadata21), SourceDistError> {
debug!("Building: {dist}");
if self.build_context.no_build() {
bail!("Building source distributions is disabled");
return Err(SourceDistError::NoBuild);
}
// Build the wheel.
@ -524,7 +521,8 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
&cache_entry.dir,
&dist.to_string(),
)
.await?;
.await
.map_err(|err| SourceDistError::Build(Box::new(dist.clone()), err))?;
if let Some(temp_dir) = temp_dir {
temp_dir.close()?;
@ -532,20 +530,23 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
// Read the metadata from the wheel.
let filename = WheelFilename::from_str(&disk_filename)?;
let mut archive =
ZipArchive::new(fs_err::File::open(cache_entry.dir.join(&disk_filename))?)?;
let dist_info_dir =
find_dist_info(&filename, archive.file_names().map(|name| (name, name)))?.1;
let dist_info =
std::io::read_to_string(archive.by_name(&format!("{dist_info_dir}/METADATA"))?)?;
let metadata = Metadata21::parse(dist_info.as_bytes())?;
let metadata = read_metadata(&filename, cache_entry.dir.join(&disk_filename))?;
debug!("Finished building: {dist}");
Ok((disk_filename, filename, metadata))
}
}
/// Read the [`Metadata21`] from a built wheel.
fn read_metadata(
filename: &WheelFilename,
wheel: impl Into<PathBuf>,
) -> Result<Metadata21, SourceDistError> {
let mut archive = ZipArchive::new(fs_err::File::open(wheel)?)?;
let dist_info = read_dist_info(filename, &mut archive)?;
Ok(Metadata21::parse(&dist_info)?)
}
trait SourceDistReporter: Send + Sync {
/// Callback to invoke when a repository checkout begins.
fn on_checkout_start(&self, url: &Url, rev: &str) -> usize;