# /// script # requires-python = ">=3.12" # dependencies = [ # "httpx < 1", # ] # /// """ Fetch Python version download metadata. Generates the `download-metadata.json` file. Usage: uv run --isolated -- crates/uv-python/fetch-download-metadata.py Acknowledgements: Derived from https://github.com/mitsuhiko/rye/tree/f9822267a7f00332d15be8551f89a212e7bc9017 Originally authored by Armin Ronacher under the MIT license """ import abc import argparse import asyncio import itertools import json import logging import os import re from dataclasses import dataclass from enum import StrEnum from pathlib import Path from typing import Generator, Iterable, NamedTuple, Self from urllib.parse import unquote import httpx SELF_DIR = Path(__file__).parent VERSIONS_FILE = SELF_DIR / "download-metadata.json" def batched(iterable: Iterable, n: int) -> Generator[tuple, None, None]: """Batch data into tuples of length n. The last batch may be shorter.""" # batched('ABCDEFG', 3) --> ABC DEF G if n < 1: raise ValueError("n must be at least one") it = iter(iterable) while batch := tuple(itertools.islice(it, n)): yield batch class PlatformTriple(NamedTuple): arch: str platform: str libc: str class Version(NamedTuple): major: int minor: int patch: int @classmethod def from_str(cls, version: str) -> Self: major, minor, patch = version.split(".", 3) return cls(int(major), int(minor), int(patch)) def __str__(self) -> str: return f"{self.major}.{self.minor}.{self.patch}" class ImplementationName(StrEnum): CPYTHON = "cpython" PYPY = "pypy" @dataclass class PythonDownload: version: Version triple: PlatformTriple flavor: str implementation: ImplementationName filename: str url: str sha256: str | None = None def key(self) -> str: return f"{self.implementation}-{self.version}-{self.triple.platform}-{self.triple.arch}-{self.triple.libc}" class Finder: implementation: ImplementationName @abc.abstractmethod async def find(self) -> list[PythonDownload]: raise NotImplementedError class CPythonFinder(Finder): implementation = ImplementationName.CPYTHON RELEASE_URL = ( "https://api.github.com/repos/indygreg/python-build-standalone/releases" ) FLAVOR_PREFERENCES = [ "install_only_stripped", "install_only", "shared-pgo", "shared-noopt", "static-noopt", "pgo+lto", "pgo", "lto", "debug", ] HIDDEN_FLAVORS = [ "noopt", ] SPECIAL_TRIPLES = { "macos": "x86_64-apple-darwin", "linux64": "x86_64-unknown-linux-gnu", "windows-amd64": "x86_64-pc-windows", "windows-x86": "i686-pc-windows", "windows-amd64-shared": "x86_64-pc-windows", "windows-x86-shared": "i686-pc-windows", "linux64-musl": "x86_64-unknown-linux-musl", } # Normalized mappings to match the Rust types ARCH_MAP = { "ppc64": "powerpc64", "ppc64le": "powerpc64le", } _filename_re = re.compile( r"""(?x) ^ cpython-(?P\d+\.\d+\.\d+?) (?:\+\d+)? -(?P.*?) (?:-[\dT]+)?\.tar\.(?:gz|zst) $ """ ) _flavor_re = re.compile( r"""(?x)^(.*?)-(%s)$""" % ( "|".join( map( re.escape, sorted(FLAVOR_PREFERENCES + HIDDEN_FLAVORS, key=len, reverse=True), ) ) ) ) def __init__(self, client: httpx.AsyncClient): self.client = client async def find(self) -> list[PythonDownload]: downloads = await self._fetch_downloads() await self._fetch_checksums(downloads, n=20) return downloads async def _fetch_downloads(self, pages: int = 100) -> list[PythonDownload]: """Fetch all the indygreg downloads from the release API.""" results: dict[Version, list[PythonDownload]] = {} # Collect all available Python downloads for page in range(1, pages + 1): logging.info("Fetching CPython release page %d", page) resp = await self.client.get(self.RELEASE_URL, params={"page": page}) resp.raise_for_status() rows = resp.json() if not rows: break for row in rows: for asset in row["assets"]: url = asset["browser_download_url"] download = self._parse_download_url(url) if download is None: continue results.setdefault(download.version, []).append(download) # Collapse CPython variants to a single URL flavor per triple downloads = [] for choices in results.values(): flavors: dict[PlatformTriple, tuple[PythonDownload, int]] = {} for choice in choices: priority = self._get_flavor_priority(choice.flavor) existing = flavors.get(choice.triple) if existing: _, existing_priority = existing # Skip if we have a flavor with higher priority already (indicated by a smaller value) if priority >= existing_priority: continue flavors[choice.triple] = (choice, priority) # Drop the priorities downloads.extend([choice for choice, _ in flavors.values()]) return downloads async def _fetch_checksums(self, downloads: list[PythonDownload], n: int) -> None: """Fetch the checksums for the given downloads.""" checksum_urls = set() for download in downloads: release_base_url = download.url.rsplit("/", maxsplit=1)[0] checksum_url = release_base_url + "/SHA256SUMS" checksum_urls.add(checksum_url) async def fetch_checksums(url: str) -> httpx.Response | None: try: resp = await self.client.get(url) resp.raise_for_status() except httpx.HTTPStatusError as e: if e.response.status_code == 404: return None raise return resp completed = 0 tasks = [] for batch in batched(checksum_urls, n): logging.info( "Fetching CPython checksums: %d/%d", completed, len(checksum_urls) ) async with asyncio.TaskGroup() as tg: for url in batch: task = tg.create_task(fetch_checksums(url)) tasks.append(task) completed += n checksums = {} for task in tasks: resp = task.result() if resp is None: continue lines = resp.text.splitlines() for line in lines: checksum, filename = line.split(" ", maxsplit=1) filename = filename.strip() checksums[filename] = checksum for download in downloads: download.sha256 = checksums.get(download.filename) def _parse_download_url(self, url: str) -> PythonDownload | None: """Parse an indygreg download URL into a PythonDownload object.""" # Ex) # https://github.com/indygreg/python-build-standalone/releases/download/20240107/cpython-3.12.1%2B20240107-aarch64-unknown-linux-gnu-lto-full.tar.zst if url.endswith(".sha256"): return None filename = unquote(url.rsplit("/", maxsplit=1)[-1]) match = self._filename_re.match(filename) if match is None: return None version, triple = match.groups() if triple.endswith("-full"): triple = triple[:-5] match = self._flavor_re.match(triple) if match is not None: triple, flavor = match.groups() else: flavor = "" if flavor in self.HIDDEN_FLAVORS: return None version = Version.from_str(version) triple = self._normalize_triple(triple) if triple is None: return None return PythonDownload( version=version, triple=triple, flavor=flavor, implementation=self.implementation, filename=filename, url=url, ) def _normalize_triple(self, triple: str) -> PlatformTriple | None: if "-static" in triple: logging.debug("Skipping %r: static unsupported", triple) return None triple = self.SPECIAL_TRIPLES.get(triple, triple) pieces = triple.split("-") try: arch = self._normalize_arch(pieces[0]) operating_system = self._normalize_os(pieces[2]) if pieces[2] == "linux": # On linux, the triple has four segments, the last one is the libc libc = pieces[3] else: libc = "none" except IndexError: logging.debug("Skipping %r: unknown triple", triple) return None return PlatformTriple(arch, operating_system, libc) def _normalize_arch(self, arch: str) -> str: arch = self.ARCH_MAP.get(arch, arch) pieces = arch.split("_") # Strip `_vN` from `x86_64` return "_".join(pieces[:2]) def _normalize_os(self, os: str) -> str: return os def _get_flavor_priority(self, flavor: str) -> int: """Returns the priority of a flavor. Lower is better.""" try: pref = self.FLAVOR_PREFERENCES.index(flavor) except ValueError: pref = len(self.FLAVOR_PREFERENCES) + 1 return pref class PyPyFinder(Finder): implementation = ImplementationName.PYPY RELEASE_URL = "https://raw.githubusercontent.com/pypy/pypy/main/pypy/tool/release/versions.json" CHECKSUM_URL = ( "https://raw.githubusercontent.com/pypy/pypy.org/main/pages/checksums.rst" ) _checksum_re = re.compile( r"^\s*(?P\w{64})\s+(?Ppypy.+)$", re.MULTILINE ) ARCH_MAPPING = { "x64": "x86_64", "x86": "i686", "i686": "i686", "aarch64": "aarch64", "arm64": "aarch64", "s390x": "s390x", } PLATFORM_MAPPING = { "win32": "windows", "win64": "windows", "linux": "linux", "darwin": "darwin", } def __init__(self, client: httpx.AsyncClient): self.client = client async def find(self) -> list[PythonDownload]: downloads = await self._fetch_downloads() await self._fetch_checksums(downloads) return downloads async def _fetch_downloads(self) -> list[PythonDownload]: resp = await self.client.get(self.RELEASE_URL) resp.raise_for_status() versions = resp.json() results = {} for version in versions: if not version["stable"]: continue python_version = Version.from_str(version["python_version"]) if python_version < (3, 7, 0): continue for file in version["files"]: arch = self._normalize_arch(file["arch"]) platform = self._normalize_os(file["platform"]) libc = "gnu" if platform == "linux" else "none" download = PythonDownload( version=python_version, triple=PlatformTriple( arch=arch, platform=platform, libc=libc, ), flavor="", implementation=self.implementation, filename=file["filename"], url=file["download_url"], ) # Only keep the latest pypy version of each arch/platform if (python_version, arch, platform) not in results: results[(python_version, arch, platform)] = download return list(results.values()) def _normalize_arch(self, arch: str) -> str: return self.ARCH_MAPPING.get(arch, arch) def _normalize_os(self, os: str) -> str: return self.PLATFORM_MAPPING.get(os, os) async def _fetch_checksums(self, downloads: list[PythonDownload]) -> None: logging.info("Fetching PyPy checksums") resp = await self.client.get(self.CHECKSUM_URL) resp.raise_for_status() text = resp.text checksums = {} for match in self._checksum_re.finditer(text): checksums[match.group("filename")] = match.group("checksum") for download in downloads: download.sha256 = checksums.get(download.filename) def render(downloads: list[PythonDownload]) -> None: """Render `download-metadata.json`.""" def sort_key(download: PythonDownload) -> tuple: # Sort by implementation, version (latest first), and then by triple. impl_order = [ImplementationName.CPYTHON, ImplementationName.PYPY] return ( impl_order.index(download.implementation), -download.version.major, -download.version.minor, -download.version.patch, download.triple, ) downloads.sort(key=sort_key) results = {} for download in downloads: key = download.key() logging.info("Found %s (%s)", key, download.flavor) results[key] = { "name": download.implementation, "arch": download.triple.arch, "os": download.triple.platform, "libc": download.triple.libc, "major": download.version.major, "minor": download.version.minor, "patch": download.version.patch, "url": download.url, "sha256": download.sha256, } VERSIONS_FILE.parent.mkdir(parents=True, exist_ok=True) # Make newlines consistent across platforms VERSIONS_FILE.write_text(json.dumps(results, indent=2), newline="\n") async def find() -> None: token = os.environ.get("GITHUB_TOKEN") if not token: logging.warning( "`GITHUB_TOKEN` env var not found, you may hit rate limits for GitHub API requests." ) headers = {"X-GitHub-Api-Version": "2022-11-28"} if token: headers["Authorization"] = "Bearer " + token client = httpx.AsyncClient(follow_redirects=True, headers=headers, timeout=15) finders = [ CPythonFinder(client), PyPyFinder(client), ] downloads = [] async with client: for finder in finders: logging.info("Finding %s downloads...", finder.implementation) downloads.extend(await finder.find()) render(downloads) def main() -> None: parser = argparse.ArgumentParser(description="Fetch Python version metadata.") parser.add_argument( "-v", "--verbose", action="store_true", help="Enable debug logging", ) parser.add_argument( "-q", "--quiet", action="store_true", help="Disable logging", ) args = parser.parse_args() if args.quiet: log_level = logging.CRITICAL elif args.verbose: log_level = logging.DEBUG else: log_level = logging.INFO logging.basicConfig( level=log_level, format="%(asctime)s %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) # Silence httpx logging logging.getLogger("httpx").setLevel(logging.WARNING) asyncio.run(find()) if __name__ == "__main__": main()