From e23292641fd4e566c8838c95d1143d448c5bf28c Mon Sep 17 00:00:00 2001 From: konsti Date: Sun, 24 Dec 2023 19:31:52 +0100 Subject: [PATCH] Add pypi 10k packages with most dependents dataset (#711) From manual inspection, this dataset generated through the [libraries.io API](https://libraries.io/api#project-search) seems more mainstream than the current 8k one, which is also preserved. I've added the dataset to the repo because the API requires an API key. --- CONTRIBUTING.md | 2 +- Cargo.lock | 4 + crates/puffin-dev/Cargo.toml | 4 + crates/puffin-dev/src/install_many.rs | 161 ++++++++++++++++++ crates/puffin-dev/src/main.rs | 14 +- .../src/index/registry_wheel_index.rs | 18 +- crates/puffin-installer/src/downloader.rs | 44 +++-- scripts/popular_packages/.gitignore | 2 + .../pypi_10k_most_dependents.ipynb | 86 ++++++++++ .../pypi_8k_downloads.sh} | 2 +- scripts/resolve/.gitignore | 1 - 11 files changed, 315 insertions(+), 23 deletions(-) create mode 100644 crates/puffin-dev/src/install_many.rs create mode 100644 scripts/popular_packages/.gitignore create mode 100644 scripts/popular_packages/pypi_10k_most_dependents.ipynb rename scripts/{resolve/get_pypi_top_8k.sh => popular_packages/pypi_8k_downloads.sh} (54%) delete mode 100644 scripts/resolve/.gitignore diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3f2118f42..2d89a6e8d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -8,7 +8,7 @@ Source distributions can run arbitrary code on build and can make unwanted modif docker buildx build -t puffin-builder -f builder.dockerfile . 
# Build for musl to avoid glibc errors, might not be required with your OS version cargo build --target x86_64-unknown-linux-musl -docker run --rm -it -v $(pwd):/app puffin-builder /app/target/x86_64-unknown-linux-musl/debug/puffin-dev resolve-many --cache-dir /app/cache-docker /app/scripts/resolve/pypi_top_8k_flat.txt +docker run --rm -it -v $(pwd):/app puffin-builder /app/target/x86_64-unknown-linux-musl/debug/puffin-dev resolve-many --cache-dir /app/cache-docker /app/scripts/popular_packages/pypi_10k_most_dependents.txt ``` We recommend using this container if you don't trust the dependency tree of the package(s) you are trying to resolve or install. diff --git a/Cargo.lock b/Cargo.lock index 142bef19d..0a28d0784 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2412,6 +2412,7 @@ dependencies = [ "futures", "gourgeist", "indicatif", + "install-wheel-rs", "itertools 0.11.0", "mimalloc", "pep508_rs", @@ -2422,11 +2423,14 @@ dependencies = [ "puffin-cache", "puffin-client", "puffin-dispatch", + "puffin-distribution", + "puffin-installer", "puffin-interpreter", "puffin-normalize", "puffin-resolver", "puffin-traits", "pypi-types", + "rustc-hash", "tempfile", "tikv-jemallocator", "tokio", diff --git a/crates/puffin-dev/Cargo.toml b/crates/puffin-dev/Cargo.toml index 3f3d8c5e0..fceaba119 100644 --- a/crates/puffin-dev/Cargo.toml +++ b/crates/puffin-dev/Cargo.toml @@ -17,6 +17,7 @@ workspace = true distribution-filename = { path = "../distribution-filename" } distribution-types = { path = "../distribution-types" } gourgeist = { path = "../gourgeist" } +install-wheel-rs = { path = "../install-wheel-rs" } pep508_rs = { path = "../pep508-rs" } platform-host = { path = "../platform-host" } platform-tags = { path = "../platform-tags" } @@ -24,6 +25,8 @@ puffin-build = { path = "../puffin-build" } puffin-cache = { path = "../puffin-cache", features = ["clap"] } puffin-client = { path = "../puffin-client" } puffin-dispatch = { path = "../puffin-dispatch" } +puffin-distribution = { 
path = "../puffin-distribution" } +puffin-installer = { path = "../puffin-installer" } puffin-interpreter = { path = "../puffin-interpreter" } puffin-normalize = { path = "../puffin-normalize" } puffin-resolver = { path = "../puffin-resolver" } @@ -40,6 +43,7 @@ futures = { workspace = true } indicatif = { workspace = true } itertools = { workspace = true } petgraph = { workspace = true } +rustc-hash = { workspace = true } tempfile = { workspace = true } tokio = { workspace = true } tracing = { workspace = true } diff --git a/crates/puffin-dev/src/install_many.rs b/crates/puffin-dev/src/install_many.rs new file mode 100644 index 000000000..c3fbce827 --- /dev/null +++ b/crates/puffin-dev/src/install_many.rs @@ -0,0 +1,161 @@ +use std::iter::Iterator; +use std::path::PathBuf; +use std::str::FromStr; + +use anyhow::{Context, Result}; +use clap::Parser; +use futures::StreamExt; +use itertools::{Either, Itertools}; +use rustc_hash::FxHashMap; +use tracing::info; + +use distribution_types::{CachedDist, Dist, DistributionMetadata, Name, Resolution, VersionOrUrl}; +use install_wheel_rs::linker::LinkMode; +use pep508_rs::Requirement; +use platform_host::Platform; +use platform_tags::Tags; +use puffin_cache::{Cache, CacheArgs}; +use puffin_client::{RegistryClient, RegistryClientBuilder}; +use puffin_dispatch::BuildDispatch; +use puffin_distribution::RegistryWheelIndex; +use puffin_installer::Downloader; +use puffin_interpreter::Virtualenv; +use puffin_normalize::PackageName; +use puffin_resolver::DistFinder; +use puffin_traits::{BuildContext, OnceMap}; +use pypi_types::IndexUrls; + +#[derive(Parser)] +pub(crate) struct InstallManyArgs { + /// Path to a file containing one requirement per line. + requirements: PathBuf, + #[clap(long)] + limit: Option, + /// Don't build source distributions. This means resolving will not run arbitrary code. The + /// cached wheels of already built source distributions will be reused. 
+ #[clap(long)] + no_build: bool, + /// Run this many tasks in parallel + #[clap(long, default_value = "50")] + num_tasks: usize, + #[command(flatten)] + cache_args: CacheArgs, +} + +pub(crate) async fn install_many(args: InstallManyArgs) -> Result<()> { + let data = fs_err::read_to_string(&args.requirements)?; + + let lines = data.lines().map(Requirement::from_str); + let requirements: Vec = if let Some(limit) = args.limit { + lines.take(limit).collect::>()? + } else { + lines.collect::>()? + }; + info!("Got {} requirements", requirements.len()); + + let cache = Cache::try_from(args.cache_args)?; + let platform = Platform::current()?; + let venv = Virtualenv::from_env(platform, &cache)?; + let client = RegistryClientBuilder::new(cache.clone()).build(); + let index_urls = IndexUrls::default(); + let tags = Tags::from_interpreter(venv.interpreter())?; + let build_dispatch = BuildDispatch::new( + &client, + &cache, + venv.interpreter(), + &index_urls, + venv.python_executable(), + args.no_build, + ); + + for (idx, requirements) in requirements.chunks(100).enumerate() { + info!("Chunk {idx}"); + install_chunk( + requirements, + &build_dispatch, + &tags, + &client, + &venv, + &index_urls, + ) + .await?; + } + + Ok(()) +} + +async fn install_chunk( + requirements: &[Requirement], + build_dispatch: &BuildDispatch<'_>, + tags: &Tags, + client: &RegistryClient, + venv: &Virtualenv, + index_urls: &IndexUrls, +) -> Result<()> { + let resolution: Vec<_> = DistFinder::new(tags, client, venv.interpreter()) + .resolve_stream(requirements) + .collect() + .await; + let (resolution, failures): (FxHashMap, Vec<_>) = + resolution.into_iter().partition_result(); + for failure in &failures { + info!("Failed to find wheel: {failure}"); + } + info!("Failed to find {} wheel(s)", failures.len()); + let wheels_and_source_dist = resolution.len(); + let resolution = if build_dispatch.no_build() { + let only_wheels: FxHashMap<_, _> = resolution + .into_iter() + .filter(|(_, dist)| match dist { 
+ Dist::Built(_) => true, + Dist::Source(_) => false, + }) + .collect(); + info!( + "Removed {} source dists", + wheels_and_source_dist - only_wheels.len() + ); + only_wheels + } else { + resolution + }; + let dists = Resolution::new(resolution) + .into_distributions() + .collect::>(); + + let mut registry_index = RegistryWheelIndex::new(build_dispatch.cache(), tags, index_urls); + let (cached, uncached): (Vec<_>, Vec<_>) = dists.into_iter().partition_map(|dist| { + // We always want the wheel for the latest version not whatever matching is in cache + let VersionOrUrl::Version(version) = dist.version_or_url() else { + unreachable!(); + }; + + if let Some(cached) = registry_index.get_version(dist.name(), version) { + Either::Left(CachedDist::Registry(cached.clone())) + } else { + Either::Right(dist) + } + }); + info!("Cached: {}, Uncached {}", cached.len(), uncached.len()); + + let downloader = Downloader::new(build_dispatch.cache(), tags, client, build_dispatch); + let in_flight = OnceMap::default(); + let fetches: Vec<_> = futures::stream::iter(uncached) + .map(|dist| downloader.get_wheel(dist, &in_flight)) + .buffer_unordered(50) + .collect() + .await; + let (wheels, failures): (Vec<_>, Vec<_>) = fetches.into_iter().partition_result(); + for failure in &failures { + info!("Failed to fetch wheel: {failure}"); + } + info!("Failed to fetch {} wheel(s)", failures.len()); + + let wheels: Vec<_> = wheels.into_iter().chain(cached).collect(); + puffin_installer::Installer::new(venv) + .with_link_mode(LinkMode::default()) + .install(&wheels) + .context("Failed to install")?; + info!("Installed {} wheels", wheels.len()); + Ok(()) +} diff --git a/crates/puffin-dev/src/main.rs b/crates/puffin-dev/src/main.rs index a701a86c7..e0e9eb46c 100644 --- a/crates/puffin-dev/src/main.rs +++ b/crates/puffin-dev/src/main.rs @@ -15,6 +15,7 @@ use tracing_subscriber::EnvFilter; use resolve_many::ResolveManyArgs; use crate::build::{build, BuildArgs}; +use 
crate::install_many::InstallManyArgs; use crate::resolve_cli::ResolveCliArgs; use crate::wheel_metadata::WheelMetadataArgs; @@ -35,6 +36,7 @@ static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; mod build; +mod install_many; mod resolve_cli; mod resolve_many; mod wheel_metadata; @@ -45,11 +47,16 @@ enum Cli { Build(BuildArgs), /// Resolve many requirements independently in parallel and report failures and sucesses. /// - /// Run `scripts/resolve/get_pypi_top_8k.sh` once, then + /// Run `scripts/popular_packages/pypi_8k_downloads.sh` once, then /// ```bash - /// cargo run --bin puffin-dev -- resolve-many scripts/resolve/pypi_top_8k_flat.txt + /// cargo run --bin puffin-dev -- resolve-many scripts/popular_packages/pypi_8k_downloads.txt + /// ``` + /// or + /// ```bash + /// cargo run --bin puffin-dev -- resolve-many scripts/popular_packages/pypi_10k_most_dependents.txt /// ``` ResolveMany(ResolveManyArgs), + InstallMany(InstallManyArgs), /// Resolve requirements passed on the CLI ResolveCli(ResolveCliArgs), WheelMetadata(WheelMetadataArgs), @@ -65,6 +72,9 @@ async fn run() -> Result<()> { Cli::ResolveMany(args) => { resolve_many::resolve_many(args).await?; } + Cli::InstallMany(args) => { + install_many::install_many(args).await?; + } Cli::ResolveCli(args) => { resolve_cli::resolve_cli(args).await?; } diff --git a/crates/puffin-distribution/src/index/registry_wheel_index.rs b/crates/puffin-distribution/src/index/registry_wheel_index.rs index a86a12413..bded698c3 100644 --- a/crates/puffin-distribution/src/index/registry_wheel_index.rs +++ b/crates/puffin-distribution/src/index/registry_wheel_index.rs @@ -41,13 +41,29 @@ impl<'a> RegistryWheelIndex<'a> { &mut self, name: &PackageName, ) -> impl Iterator { + self.get_impl(name).iter().rev() + } + + /// Get the best wheel for the given package name and version. 
+ /// + /// If the package is not yet indexed, this will index the package by reading from the cache. + pub fn get_version( + &mut self, + name: &PackageName, + version: &Version, + ) -> Option<&CachedRegistryDist> { + self.get_impl(name).get(version) + } + + /// Get an entry in the index. + fn get_impl(&mut self, name: &PackageName) -> &BTreeMap { let versions = match self.index.entry(name.clone()) { Entry::Occupied(entry) => entry.into_mut(), Entry::Vacant(entry) => { entry.insert(Self::index(name, self.cache, self.tags, self.index_urls)) } }; - versions.iter().rev() + versions } /// Add a package to the index by reading from the cache. diff --git a/crates/puffin-installer/src/downloader.rs b/crates/puffin-installer/src/downloader.rs index 475da035a..32c1720be 100644 --- a/crates/puffin-installer/src/downloader.rs +++ b/crates/puffin-installer/src/downloader.rs @@ -2,7 +2,7 @@ use std::cmp::Reverse; use std::path::{Path, PathBuf}; use std::sync::Arc; -use futures::{StreamExt, TryFutureExt}; +use futures::{Stream, StreamExt, TryFutureExt, TryStreamExt}; use tokio::task::JoinError; use tracing::{instrument, warn}; use url::Url; @@ -60,8 +60,27 @@ impl<'a, Context: BuildContext + Send + Sync> Downloader<'a, Context> { } } + /// Fetch, build, and unzip the distributions in parallel. + #[instrument(name = "download_distributions", skip_all, fields(total = distributions.len()))] + pub fn download_stream<'stream>( + &'stream self, + distributions: Vec, + in_flight: &'stream OnceMap>, + ) -> impl Stream> + 'stream { + futures::stream::iter(distributions) + .map(|dist| async { + let wheel = self.get_wheel(dist, in_flight).await?; + if let Some(reporter) = self.reporter.as_ref() { + reporter.on_progress(&wheel); + } + Ok::(wheel) + }) + // TODO(charlie): The number of concurrent fetches, such that we limit the number of + // concurrent builds to the number of cores, while allowing more concurrent downloads. 
+ .buffer_unordered(50) + } + /// Download, build, and unzip a set of downloaded wheels. - #[instrument(skip_all)] pub async fn download( &self, mut distributions: Vec, @@ -72,20 +91,10 @@ impl<'a, Context: BuildContext + Send + Sync> Downloader<'a, Context> { Reverse(distribution.size().unwrap_or(usize::MAX)) }); - // Fetch, build, and unzip the distributions in parallel. - // TODO(charlie): The number of concurrent fetches, such that we limit the number of - // concurrent builds to the number of cores, while allowing more concurrent downloads. - let mut wheels = Vec::with_capacity(distributions.len()); - let mut fetches = futures::stream::iter(distributions) - .map(|dist| self.get_wheel(dist, in_flight)) - .buffer_unordered(50); - - while let Some(wheel) = fetches.next().await.transpose()? { - if let Some(reporter) = self.reporter.as_ref() { - reporter.on_progress(&wheel); - } - wheels.push(wheel); - } + let wheels = self + .download_stream(distributions, in_flight) + .try_collect() + .await?; if let Some(reporter) = self.reporter.as_ref() { reporter.on_complete(); @@ -143,7 +152,8 @@ impl<'a, Context: BuildContext + Send + Sync> Downloader<'a, Context> { } /// Download, build, and unzip a single wheel. 
- async fn get_wheel( + #[instrument(skip_all, fields(name = %dist, url = dist.file().unwrap().url))] + pub async fn get_wheel( &self, dist: Dist, in_flight: &OnceMap>, diff --git a/scripts/popular_packages/.gitignore b/scripts/popular_packages/.gitignore new file mode 100644 index 000000000..4acf3cfdc --- /dev/null +++ b/scripts/popular_packages/.gitignore @@ -0,0 +1,2 @@ +pypi_8k_downloads.txt +pypi_10k_most_dependents.txt diff --git a/scripts/popular_packages/pypi_10k_most_dependents.ipynb b/scripts/popular_packages/pypi_10k_most_dependents.ipynb new file mode 100644 index 000000000..4c521fe3e --- /dev/null +++ b/scripts/popular_packages/pypi_10k_most_dependents.ipynb @@ -0,0 +1,86 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "initial_id", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"To update `pypi_10k_most_dependents.txt`, enter your `api_key` from https://libraries.io/account.\n", + "\n", + "The latest version is available at: https://gist.github.com/charliermarsh/07afd9f543dfea68408a4a42cede4be4.\n", + "\"\"\"\n", + "\n", + "import httpx\n", + "\n", + "from pathlib import Path\n", + "\n", + "api_key = \"\"\n", + "responses = {}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2532bf8c426af5", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "for i in range(100): # 100 pages with 100 per page -> 10k\n", + " print(i)\n", + " if not i in responses:\n", + " # https://libraries.io/api#project-search\n", + " sort = \"dependents_count\"\n", + " url = f\"https://libraries.io/api/search?platforms=Pypi&per_page=100&page={i+1}&sort={sort}&api_key={api_key}\"\n", + " responses[i] = httpx.get(url, timeout=30.0).json()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bc80702b6f8ebc3", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [
"flat_list = []\n", + "for response in responses.values():\n", + " for entry in response:\n", + " flat_list.append(entry[\"name\"])\n", + "print(flat_list)\n", + "Path().parent.joinpath(\"pypi_10k_most_dependents.txt\").write_text(\"\\n\".join(flat_list))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scripts/resolve/get_pypi_top_8k.sh b/scripts/popular_packages/pypi_8k_downloads.sh similarity index 54% rename from scripts/resolve/get_pypi_top_8k.sh rename to scripts/popular_packages/pypi_8k_downloads.sh index 0bd352a0e..bb5a2b605 100755 --- a/scripts/resolve/get_pypi_top_8k.sh +++ b/scripts/popular_packages/pypi_8k_downloads.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash -curl https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json | jq -r ".rows | .[].project" > pypi_top_8k_flat.txt +curl https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json | jq -r ".rows | .[].project" > pypi_8k_downloads.txt diff --git a/scripts/resolve/.gitignore b/scripts/resolve/.gitignore deleted file mode 100644 index ff5a59fdb..000000000 --- a/scripts/resolve/.gitignore +++ /dev/null @@ -1 +0,0 @@ -pypi_top_8k_flat.txt \ No newline at end of file