use std::collections::btree_map::Entry; use std::collections::BTreeMap; use std::path::PathBuf; use reqwest::Response; use rustc_hash::FxHashMap; use tracing::{debug, info_span, instrument, warn, Instrument}; use url::Url; use distribution_filename::DistFilename; use distribution_types::{ BuiltDist, Dist, File, FileLocation, FlatIndexLocation, IndexUrl, PrioritizedDistribution, RegistryBuiltDist, RegistrySourceDist, SourceDist, }; use pep440_rs::Version; use pep508_rs::VerbatimUrl; use platform_tags::Tags; use puffin_cache::{Cache, CacheBucket}; use puffin_normalize::PackageName; use pypi_types::Hashes; use crate::html::SimpleHtml; use crate::{Error, RegistryClient}; #[derive(Debug, thiserror::Error)] pub enum FlatIndexError { #[error("Failed to read `--find-links` directory: {0}")] FindLinksDirectory(PathBuf, #[source] std::io::Error), #[error("Failed to read `--find-links` URL: {0}")] FindLinksUrl(Url, #[source] Error), } type FlatIndexEntry = (DistFilename, File, IndexUrl); /// A client for reading distributions from `--find-links` entries (either local directories or /// remote HTML indexes). #[derive(Debug, Clone)] pub struct FlatIndexClient<'a> { client: &'a RegistryClient, cache: &'a Cache, } impl<'a> FlatIndexClient<'a> { /// Create a new [`FlatIndexClient`]. pub fn new(client: &'a RegistryClient, cache: &'a Cache) -> Self { Self { client, cache } } /// Read the directories and flat remote indexes from `--find-links`. #[allow(clippy::result_large_err)] pub async fn fetch( &self, indexes: impl Iterator, ) -> Result, FlatIndexError> { let mut dists = Vec::new(); // TODO(konstin): Parallelize reads over flat indexes. for flat_index in indexes { let index_dists = match flat_index { FlatIndexLocation::Path(path) => Self::read_from_directory(path) .map_err(|err| FlatIndexError::FindLinksDirectory(path.clone(), err))?, FlatIndexLocation::Url(url) => self .read_from_url(url) .await .map_err(|err| FlatIndexError::FindLinksUrl(url.clone(), err))?, }; if index_dists.is_empty() { warn!("No packages found in `--find-links` entry: {}", flat_index); } else { debug!( "Found {} package{} in `--find-links` entry: {}", index_dists.len(), if index_dists.len() == 1 { "" } else { "s" }, flat_index ); } dists.extend(index_dists); } Ok(dists) } /// Read a flat remote index from a `--find-links` URL. async fn read_from_url(&self, url: &Url) -> Result, Error> { let cache_entry = self.cache.entry( CacheBucket::FlatIndex, "html", format!("{}.msgpack", cache_key::digest(&url.to_string())), ); let cached_client = self.client.cached_client(); let flat_index_request = cached_client .uncached() .get(url.clone()) .header("Accept-Encoding", "gzip") .header("Accept", "text/html") .build()?; let parse_simple_response = |response: Response| { async { let text = response.text().await?; let SimpleHtml { base, files } = SimpleHtml::parse(&text, url) .map_err(|err| Error::from_html_err(err, url.clone()))?; let files: Vec = files .into_iter() .filter_map(|file| { match File::try_from(file, &base) { Ok(file) => Some(file), Err(err) => { // Ignore files with unparseable version specifiers. warn!("Skipping file in {url}: {err}"); None } } }) .collect(); Ok(files) } .instrument(info_span!("parse_flat_index_html", url = % url)) }; let files = cached_client .get_cached_with_callback(flat_index_request, &cache_entry, parse_simple_response) .await?; Ok(files .into_iter() .filter_map(|file| { Some(( DistFilename::try_from_normalized_filename(&file.filename)?, file, IndexUrl::Url(url.clone()), )) }) .collect()) } /// Read a flat remote index from a `--find-links` directory. fn read_from_directory(path: &PathBuf) -> Result, std::io::Error> { // Absolute paths are required for the URL conversion. let path = fs_err::canonicalize(path)?; let url = Url::from_directory_path(&path).expect("URL is already absolute"); let url = VerbatimUrl::unknown(url); let mut dists = Vec::new(); for entry in fs_err::read_dir(&path)? { let entry = entry?; let metadata = entry.metadata()?; if !metadata.is_file() { continue; } let Ok(filename) = entry.file_name().into_string() else { warn!( "Skipping non-UTF-8 filename in `--find-links` directory: {}", entry.file_name().to_string_lossy() ); continue; }; let file = File { dist_info_metadata: None, filename: filename.to_string(), hashes: Hashes { sha256: None }, requires_python: None, size: None, upload_time: None, url: FileLocation::Path(entry.path().to_path_buf(), url.clone()), yanked: None, }; let Some(filename) = DistFilename::try_from_normalized_filename(&filename) else { debug!( "Ignoring `--find-links` entry (expected a wheel or source distribution filename): {}", entry.path().display() ); continue; }; dists.push((filename, file, IndexUrl::Pypi)); } Ok(dists) } } /// A set of [`PrioritizedDistribution`] from a `--find-links` entry, indexed by [`PackageName`] /// and [`Version`]. #[derive(Debug, Clone, Default)] pub struct FlatIndex(FxHashMap); impl FlatIndex { /// Collect all files from a `--find-links` target into a [`FlatIndex`]. #[instrument(skip_all)] pub fn from_entries(entries: Vec, tags: &Tags) -> Self { let mut flat_index = FxHashMap::default(); // Collect compatible distributions. for (filename, file, index) in entries { let distributions = flat_index.entry(filename.name().clone()).or_default(); Self::add_file(distributions, file, filename, tags, index); } Self(flat_index) } fn add_file( distributions: &mut FlatDistributions, file: File, filename: DistFilename, tags: &Tags, index: IndexUrl, ) { // No `requires-python` here: for source distributions, we don't have that information; // for wheels, we read it lazily only when selected. match filename { DistFilename::WheelFilename(filename) => { let priority = filename.compatibility(tags); let version = filename.version.clone(); let dist = Dist::Built(BuiltDist::Registry(RegistryBuiltDist { filename, file, index, })); match distributions.0.entry(version) { Entry::Occupied(mut entry) => { entry.get_mut().insert_built(dist, None, None, priority); } Entry::Vacant(entry) => { entry.insert(PrioritizedDistribution::from_built( dist, None, None, priority, )); } } } DistFilename::SourceDistFilename(filename) => { let dist = Dist::Source(SourceDist::Registry(RegistrySourceDist { filename: filename.clone(), file, index, })); match distributions.0.entry(filename.version.clone()) { Entry::Occupied(mut entry) => { entry.get_mut().insert_source(dist, None, None); } Entry::Vacant(entry) => { entry.insert(PrioritizedDistribution::from_source(dist, None, None)); } } } } } /// Get the [`FlatDistributions`] for the given package name. pub fn get(&self, package_name: &PackageName) -> Option<&FlatDistributions> { self.0.get(package_name) } } /// A set of [`PrioritizedDistribution`] from a `--find-links` entry for a single package, indexed /// by [`Version`]. #[derive(Debug, Clone, Default)] pub struct FlatDistributions(BTreeMap); impl FlatDistributions { pub fn iter(&self) -> impl Iterator { self.0.iter() } } impl From for BTreeMap { fn from(distributions: FlatDistributions) -> Self { distributions.0 } }