Mirror of https://github.com/astral-sh/uv.git, synced 2025-07-07 21:35:00 +00:00
Add pypi 10k packages with most dependents dataset (#711)
From manual inspection, this dataset, generated through the [libraries.io API](https://libraries.io/api#project-search), seems more mainstream than the current 8k one, which is also preserved. I've added the dataset to the repo because the API requires an API key.
parent 5bce699ee1
commit e23292641f

11 changed files with 315 additions and 23 deletions
@@ -8,7 +8,7 @@ Source distributions can run arbitrary code on build and can make unwanted modif
 docker buildx build -t puffin-builder -f builder.dockerfile .
 # Build for musl to avoid glibc errors, might not be required with your OS version
 cargo build --target x86_64-unknown-linux-musl
-docker run --rm -it -v $(pwd):/app puffin-builder /app/target/x86_64-unknown-linux-musl/debug/puffin-dev resolve-many --cache-dir /app/cache-docker /app/scripts/resolve/pypi_top_8k_flat.txt
+docker run --rm -it -v $(pwd):/app puffin-builder /app/target/x86_64-unknown-linux-musl/debug/puffin-dev resolve-many --cache-dir /app/cache-docker /app/scripts/popular_packages/pypi_10k_most_dependents.txt
 ```
 
 We recommend using this container if you don't trust the dependency tree of the package(s) you are trying to resolve or install.
|
4
Cargo.lock
generated
4
Cargo.lock
generated
|
@ -2412,6 +2412,7 @@ dependencies = [
|
|||
"futures",
|
||||
"gourgeist",
|
||||
"indicatif",
|
||||
"install-wheel-rs",
|
||||
"itertools 0.11.0",
|
||||
"mimalloc",
|
||||
"pep508_rs",
|
||||
|
@ -2422,11 +2423,14 @@ dependencies = [
|
|||
"puffin-cache",
|
||||
"puffin-client",
|
||||
"puffin-dispatch",
|
||||
"puffin-distribution",
|
||||
"puffin-installer",
|
||||
"puffin-interpreter",
|
||||
"puffin-normalize",
|
||||
"puffin-resolver",
|
||||
"puffin-traits",
|
||||
"pypi-types",
|
||||
"rustc-hash",
|
||||
"tempfile",
|
||||
"tikv-jemallocator",
|
||||
"tokio",
|
||||
|
|
crates/puffin-dev/Cargo.toml

@@ -17,6 +17,7 @@ workspace = true
distribution-filename = { path = "../distribution-filename" }
distribution-types = { path = "../distribution-types" }
gourgeist = { path = "../gourgeist" }
install-wheel-rs = { path = "../install-wheel-rs" }
pep508_rs = { path = "../pep508-rs" }
platform-host = { path = "../platform-host" }
platform-tags = { path = "../platform-tags" }
@@ -24,6 +25,8 @@ puffin-build = { path = "../puffin-build" }
puffin-cache = { path = "../puffin-cache", features = ["clap"] }
puffin-client = { path = "../puffin-client" }
puffin-dispatch = { path = "../puffin-dispatch" }
puffin-distribution = { path = "../puffin-distribution" }
puffin-installer = { path = "../puffin-installer" }
puffin-interpreter = { path = "../puffin-interpreter" }
puffin-normalize = { path = "../puffin-normalize" }
puffin-resolver = { path = "../puffin-resolver" }
@@ -40,6 +43,7 @@ futures = { workspace = true }
indicatif = { workspace = true }
itertools = { workspace = true }
petgraph = { workspace = true }
rustc-hash = { workspace = true }
tempfile = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
crates/puffin-dev/src/install_many.rs (new file, 161 lines)

@@ -0,0 +1,161 @@
use std::iter::Iterator;
use std::path::PathBuf;
use std::str::FromStr;

use anyhow::{Context, Result};
use clap::Parser;
use futures::StreamExt;
use itertools::{Either, Itertools};
use rustc_hash::FxHashMap;
use tracing::info;

use distribution_types::{CachedDist, Dist, DistributionMetadata, Name, Resolution, VersionOrUrl};
use install_wheel_rs::linker::LinkMode;
use pep508_rs::Requirement;
use platform_host::Platform;
use platform_tags::Tags;
use puffin_cache::{Cache, CacheArgs};
use puffin_client::{RegistryClient, RegistryClientBuilder};
use puffin_dispatch::BuildDispatch;
use puffin_distribution::RegistryWheelIndex;
use puffin_installer::Downloader;
use puffin_interpreter::Virtualenv;
use puffin_normalize::PackageName;
use puffin_resolver::DistFinder;
use puffin_traits::{BuildContext, OnceMap};
use pypi_types::IndexUrls;

#[derive(Parser)]
pub(crate) struct InstallManyArgs {
    /// Path to a file containing one requirement per line.
    requirements: PathBuf,
    #[clap(long)]
    limit: Option<usize>,
    /// Don't build source distributions. This means resolving will not run arbitrary code. The
    /// cached wheels of already built source distributions will be reused.
    #[clap(long)]
    no_build: bool,
    /// Run this many tasks in parallel.
    #[clap(long, default_value = "50")]
    num_tasks: usize,
    #[command(flatten)]
    cache_args: CacheArgs,
}

pub(crate) async fn install_many(args: InstallManyArgs) -> Result<()> {
    let data = fs_err::read_to_string(&args.requirements)?;

    let lines = data.lines().map(Requirement::from_str);
    let requirements: Vec<Requirement> = if let Some(limit) = args.limit {
        lines.take(limit).collect::<Result<_, _>>()?
    } else {
        lines.collect::<Result<_, _>>()?
    };
    info!("Got {} requirements", requirements.len());

    let cache = Cache::try_from(args.cache_args)?;
    let platform = Platform::current()?;
    let venv = Virtualenv::from_env(platform, &cache)?;
    let client = RegistryClientBuilder::new(cache.clone()).build();
    let index_urls = IndexUrls::default();
    let tags = Tags::from_interpreter(venv.interpreter())?;
    let build_dispatch = BuildDispatch::new(
        &client,
        &cache,
        venv.interpreter(),
        &index_urls,
        venv.python_executable(),
        args.no_build,
    );

    for (idx, requirements) in requirements.chunks(100).enumerate() {
        info!("Chunk {idx}");
        install_chunk(
            requirements,
            &build_dispatch,
            &tags,
            &client,
            &venv,
            &index_urls,
        )
        .await?;
    }

    Ok(())
}

async fn install_chunk(
    requirements: &[Requirement],
    build_dispatch: &BuildDispatch<'_>,
    tags: &Tags,
    client: &RegistryClient,
    venv: &Virtualenv,
    index_urls: &IndexUrls,
) -> Result<()> {
    let resolution: Vec<_> = DistFinder::new(tags, client, venv.interpreter())
        .resolve_stream(requirements)
        .collect()
        .await;
    let (resolution, failures): (FxHashMap<PackageName, Dist>, Vec<_>) =
        resolution.into_iter().partition_result();
    for failure in &failures {
        info!("Failed to find wheel: {failure}");
    }
    info!("Failed to find {} wheel(s)", failures.len());
    let wheels_and_source_dist = resolution.len();
    let resolution = if build_dispatch.no_build() {
        let only_wheels: FxHashMap<_, _> = resolution
            .into_iter()
            .filter(|(_, dist)| match dist {
                Dist::Built(_) => true,
                Dist::Source(_) => false,
            })
            .collect();
        info!(
            "Removed {} source dists",
            wheels_and_source_dist - only_wheels.len()
        );
        only_wheels
    } else {
        resolution
    };
    let dists = Resolution::new(resolution)
        .into_distributions()
        .collect::<Vec<_>>();

    let mut registry_index = RegistryWheelIndex::new(build_dispatch.cache(), tags, index_urls);
    let (cached, uncached): (Vec<_>, Vec<_>) = dists.into_iter().partition_map(|dist| {
        // We always want the wheel for the latest version, not whatever matching version is in the cache.
        let VersionOrUrl::Version(version) = dist.version_or_url() else {
            unreachable!();
        };

        if let Some(cached) = registry_index.get_version(dist.name(), version) {
            Either::Left(CachedDist::Registry(cached.clone()))
        } else {
            Either::Right(dist)
        }
    });
    info!("Cached: {}, Uncached: {}", cached.len(), uncached.len());

    let downloader = Downloader::new(build_dispatch.cache(), tags, client, build_dispatch);
    let in_flight = OnceMap::default();
    let fetches: Vec<_> = futures::stream::iter(uncached)
        .map(|dist| downloader.get_wheel(dist, &in_flight))
        .buffer_unordered(50)
        .collect()
        .await;
    let (wheels, failures): (Vec<_>, Vec<_>) = fetches.into_iter().partition_result();
    for failure in &failures {
        info!("Failed to fetch wheel: {failure}");
    }
    info!("Failed to fetch {} wheel(s)", failures.len());

    let wheels: Vec<_> = wheels.into_iter().chain(cached).collect();
    puffin_installer::Installer::new(venv)
        .with_link_mode(LinkMode::default())
        .install(&wheels)
        .context("Failed to install")?;
    info!("Installed {} wheels", wheels.len());
    Ok(())
}
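A note on the two partition calls in `install_chunk`: `itertools` splits an iterator into two collections in one pass, either by `Result` variant (`partition_result`) or by an arbitrary `Either` choice (`partition_map`). A minimal, self-contained sketch of both patterns (the values and names here are illustrative, not from the diff):

```rust
use itertools::{Either, Itertools};

fn main() {
    // partition_result: split an iterator of Results into the Ok values and
    // the Err values in a single pass, as install_chunk does for resolutions
    // and wheel fetches.
    let results: Vec<Result<i32, String>> = vec![Ok(1), Err("boom".into()), Ok(3)];
    let (oks, errs): (Vec<i32>, Vec<String>) = results.into_iter().partition_result();
    assert_eq!(oks, vec![1, 3]);
    assert_eq!(errs, vec!["boom".to_string()]);

    // partition_map: route each item Left or Right by an arbitrary test, the
    // way cached wheels are separated from distributions that still need fetching.
    let (evens, odds): (Vec<u32>, Vec<u32>) =
        (0..6).partition_map(|n| if n % 2 == 0 { Either::Left(n) } else { Either::Right(n) });
    assert_eq!(evens, vec![0, 2, 4]);
    assert_eq!(odds, vec![1, 3, 5]);
}
```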
crates/puffin-dev/src/main.rs

@@ -15,6 +15,7 @@ use tracing_subscriber::EnvFilter;
 use resolve_many::ResolveManyArgs;
 
 use crate::build::{build, BuildArgs};
+use crate::install_many::InstallManyArgs;
 use crate::resolve_cli::ResolveCliArgs;
 use crate::wheel_metadata::WheelMetadataArgs;
 
@@ -35,6 +36,7 @@ static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
 static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
 
 mod build;
+mod install_many;
 mod resolve_cli;
 mod resolve_many;
 mod wheel_metadata;
@@ -45,11 +47,16 @@ enum Cli {
     Build(BuildArgs),
     /// Resolve many requirements independently in parallel and report failures and successes.
     ///
-    /// Run `scripts/resolve/get_pypi_top_8k.sh` once, then
+    /// Run `scripts/popular_packages/pypi_8k_downloads.sh` once, then
     /// ```bash
-    /// cargo run --bin puffin-dev -- resolve-many scripts/resolve/pypi_top_8k_flat.txt
+    /// cargo run --bin puffin-dev -- resolve-many scripts/popular_packages/pypi_8k_downloads.txt
     /// ```
+    /// or
+    /// ```bash
+    /// cargo run --bin puffin-dev -- resolve-many scripts/popular_packages/pypi_10k_most_dependents.txt
+    /// ```
     ResolveMany(ResolveManyArgs),
+    InstallMany(InstallManyArgs),
     /// Resolve requirements passed on the CLI
     ResolveCli(ResolveCliArgs),
     WheelMetadata(WheelMetadataArgs),
@@ -65,6 +72,9 @@ async fn run() -> Result<()> {
         Cli::ResolveMany(args) => {
            resolve_many::resolve_many(args).await?;
         }
+        Cli::InstallMany(args) => {
+            install_many::install_many(args).await?;
+        }
         Cli::ResolveCli(args) => {
             resolve_cli::resolve_cli(args).await?;
         }
@@ -41,13 +41,29 @@ impl<'a> RegistryWheelIndex<'a> {
         &mut self,
         name: &PackageName,
     ) -> impl Iterator<Item = (&Version, &CachedRegistryDist)> {
+        self.get_impl(name).iter().rev()
+    }
+
+    /// Get the best wheel for the given package name and version.
+    ///
+    /// If the package is not yet indexed, this will index the package by reading from the cache.
+    pub fn get_version(
+        &mut self,
+        name: &PackageName,
+        version: &Version,
+    ) -> Option<&CachedRegistryDist> {
+        self.get_impl(name).get(version)
+    }
+
+    /// Get an entry in the index.
+    fn get_impl(&mut self, name: &PackageName) -> &BTreeMap<Version, CachedRegistryDist> {
         let versions = match self.index.entry(name.clone()) {
             Entry::Occupied(entry) => entry.into_mut(),
             Entry::Vacant(entry) => {
                 entry.insert(Self::index(name, self.cache, self.tags, self.index_urls))
             }
         };
-        versions.iter().rev()
+        versions
     }
 
     /// Add a package to the index by reading from the cache.
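The `get_impl` refactor above is the lazy-initialization pattern over a map `Entry`: build a package's version index on first access, then hand out borrows of the cached entry. A stripped-down sketch of the same pattern, using placeholder types (`String` names, `u64` versions) rather than the real `PackageName`/`Version`/`CachedRegistryDist`:

```rust
use std::collections::hash_map::Entry;
use std::collections::{BTreeMap, HashMap};

/// Illustrative stand-in for RegistryWheelIndex: package name -> version -> wheel.
struct LazyIndex {
    index: HashMap<String, BTreeMap<u64, String>>,
}

impl LazyIndex {
    /// Return the per-package map, building it on first access (the `get_impl` pattern).
    fn get_impl(&mut self, name: &str) -> &BTreeMap<u64, String> {
        match self.index.entry(name.to_string()) {
            Entry::Occupied(entry) => entry.into_mut(),
            Entry::Vacant(entry) => entry.insert(Self::build(name)),
        }
    }

    /// Best wheel for an exact version, mirroring `get_version` above.
    fn get_version(&mut self, name: &str, version: u64) -> Option<&String> {
        self.get_impl(name).get(&version)
    }

    /// Stand-in for scanning the cache directory for built wheels.
    fn build(name: &str) -> BTreeMap<u64, String> {
        BTreeMap::from([(1, format!("{name}-1.0-py3-none-any.whl"))])
    }
}

fn main() {
    let mut index = LazyIndex { index: HashMap::new() };
    assert!(index.get_version("tqdm", 1).is_some()); // builds the entry
    assert!(index.get_version("tqdm", 2).is_none()); // reuses the cached entry
}
```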
@@ -2,7 +2,7 @@ use std::cmp::Reverse;
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
 
-use futures::{StreamExt, TryFutureExt};
+use futures::{Stream, StreamExt, TryFutureExt, TryStreamExt};
 use tokio::task::JoinError;
 use tracing::{instrument, warn};
 use url::Url;
@@ -60,8 +60,27 @@ impl<'a, Context: BuildContext + Send + Sync> Downloader<'a, Context> {
         }
     }
 
+    /// Fetch, build, and unzip the distributions in parallel.
+    #[instrument(name = "download_distributions", skip_all, fields(total = distributions.len()))]
+    pub fn download_stream<'stream>(
+        &'stream self,
+        distributions: Vec<Dist>,
+        in_flight: &'stream OnceMap<PathBuf, Result<CachedDist, String>>,
+    ) -> impl Stream<Item = Result<CachedDist, Error>> + 'stream {
+        futures::stream::iter(distributions)
+            .map(|dist| async {
+                let wheel = self.get_wheel(dist, in_flight).await?;
+                if let Some(reporter) = self.reporter.as_ref() {
+                    reporter.on_progress(&wheel);
+                }
+                Ok::<CachedDist, Error>(wheel)
+            })
+            // TODO(charlie): The number of concurrent fetches, such that we limit the number of
+            // concurrent builds to the number of cores, while allowing more concurrent downloads.
+            .buffer_unordered(50)
+    }
+
     /// Download, build, and unzip a set of downloaded wheels.
     #[instrument(skip_all)]
     pub async fn download(
         &self,
         mut distributions: Vec<Dist>,
@@ -72,20 +91,10 @@ impl<'a, Context: BuildContext + Send + Sync> Downloader<'a, Context> {
             Reverse(distribution.size().unwrap_or(usize::MAX))
         });
 
-        // Fetch, build, and unzip the distributions in parallel.
-        // TODO(charlie): The number of concurrent fetches, such that we limit the number of
-        // concurrent builds to the number of cores, while allowing more concurrent downloads.
-        let mut wheels = Vec::with_capacity(distributions.len());
-        let mut fetches = futures::stream::iter(distributions)
-            .map(|dist| self.get_wheel(dist, in_flight))
-            .buffer_unordered(50);
-
-        while let Some(wheel) = fetches.next().await.transpose()? {
-            if let Some(reporter) = self.reporter.as_ref() {
-                reporter.on_progress(&wheel);
-            }
-            wheels.push(wheel);
-        }
+        let wheels = self
+            .download_stream(distributions, in_flight)
+            .try_collect()
+            .await?;
 
         if let Some(reporter) = self.reporter.as_ref() {
             reporter.on_complete();
@@ -143,7 +152,8 @@ impl<'a, Context: BuildContext + Send + Sync> Downloader<'a, Context> {
     }
 
     /// Download, build, and unzip a single wheel.
-    async fn get_wheel(
+    #[instrument(skip_all, fields(name = %dist, url = dist.file().unwrap().url))]
+    pub async fn get_wheel(
         &self,
         dist: Dist,
         in_flight: &OnceMap<PathBuf, Result<CachedDist, String>>,
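`download_stream` above is the standard `futures` recipe: map each distribution to a future, keep up to 50 of them in flight with `buffer_unordered`, and yield results in completion order rather than input order; `download` then just drives that stream with `try_collect`. A minimal runnable sketch of the recipe, assuming only the `futures` and `tokio` crates (the `fetch` function is a stand-in, not the real wheel fetch):

```rust
use futures::{stream, StreamExt, TryStreamExt};

/// Stand-in for downloading, building, and unzipping one wheel.
async fn fetch(id: u32) -> Result<String, String> {
    Ok(format!("wheel-{id}"))
}

#[tokio::main]
async fn main() -> Result<(), String> {
    // Map each input to a future and keep up to 50 in flight at once;
    // results arrive in completion order, not input order.
    let wheels: Vec<String> = stream::iter(1..=5)
        .map(fetch)
        .buffer_unordered(50)
        .try_collect()
        .await?;
    assert_eq!(wheels.len(), 5);
    Ok(())
}
```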
scripts/popular_packages/.gitignore (vendored, new file)

@@ -0,0 +1,2 @@
pypi_8k_downloads.txt
pypi_10k_most_dependents.txt
scripts/popular_packages/pypi_10k_most_dependents.ipynb (new file, 86 lines)

@@ -0,0 +1,86 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "initial_id",
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"To update `pypi_10k_most_dependents.txt`, enter your `api_key` from https://libraries.io/account.\n",
    "\n",
    "The latest version is available at: https://gist.github.com/charliermarsh/07afd9f543dfea68408a4a42cede4be4.\n",
    "\"\"\"\n",
    "\n",
    "import httpx\n",
    "\n",
    "from pathlib import Path\n",
    "\n",
    "api_key = \"\"\n",
    "responses = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2532bf8c426af5",
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "for i in range(100):  # 100 pages with 100 per page -> 10k\n",
    "    print(i)\n",
    "    if i not in responses:\n",
    "        # https://libraries.io/api#project-search\n",
    "        sort = \"dependents_count\"\n",
    "        url = f\"https://libraries.io/api/search?platforms=Pypi&per_page=100&page={i+1}&sort={sort}&api_key={api_key}\"\n",
    "        responses[i] = httpx.get(url, timeout=30.0).json()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7bc80702b6f8ebc3",
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "flat_list = []\n",
    "for response in responses.values():\n",
    "    for entry in response:\n",
    "        flat_list.append(entry[\"name\"])\n",
    "print(flat_list)\n",
    "Path().parent.joinpath(\"pypi_10k_most_dependents.txt\").write_text(\"\\n\".join(flat_list))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
@@ -1,4 +1,4 @@
 #!/usr/bin/env bash
 
-curl https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json | jq -r ".rows | .[].project" > pypi_top_8k_flat.txt
+curl https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json | jq -r ".rows | .[].project" > pypi_8k_downloads.txt
scripts/resolve/.gitignore (vendored)

@@ -1 +0,0 @@
-pypi_top_8k_flat.txt