Refactor fetch-download-metadata.py script (#4853)

## Summary

Similar to https://github.com/astral-sh/rye/pull/680, I have made a major refactor to the `fetch-download-metadata.py` script. Some notable changes:

- Use PEP 723 inline scripts
- Fully type annotated the script
- Implemented async HTTP fetching
- Introduced a `Finder` base class and moved finder logic under a `CPythonFinder` subclass, which will make it easier to add a `PyPyFinder` later.
- Instead of fetching `xxx.sha256` for each file, the script now fetches a single `SHA256SUMS` file containing checksums for all files in the release. As a result, the script now takes around 10 seconds instead of 10+ minutes.

## Plan for Future PRs

- [ ] Implement the `PyPyFinder`
- [ ] Add a GitHub Action to run `fetch-download-metadata.py` daily and create a PR automatically

## Test Plan

```sh
cargo run -- run --isolated -- ./crates/uv-python/fetch-download-metadata.py
```
2025-10-17 05:47:45 +00:00 · 2024-07-23 11:06:25 +08:00 · 2024-07-23 11:06:25 +08:00 · 0a6efe4d26
commit 0a6efe4d26
parent f371195536
2 changed files with 359 additions and 244 deletions
--- a/crates/uv-python/fetch-download-metadata.py
+++ b/crates/uv-python/fetch-download-metadata.py
@ -1,4 +1,9 @@
-#!/usr/bin/env python3.12
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#   "httpx < 1",
+# ]
+# ///
 """
 Fetch Python version download metadata.

@ -6,7 +11,7 @@ Generates the `download-metadata.json` file.

 Usage:

-    python fetch-download-metadata.py
+    uv run --isolated -- crates/uv-python/fetch-download-metadata.py

 Acknowledgements:

@ -14,259 +19,370 @@ Acknowledgements:
    Originally authored by Armin Ronacher under the MIT license
 """

+import abc
 import argparse
-import hashlib
+import asyncio
+import itertools
 import json
 import logging
-import re
 import os
-import urllib.error
-import urllib.request
-from itertools import chain
+import re
+from dataclasses import dataclass
+from enum import StrEnum
 from pathlib import Path
+from typing import Generator, Iterable, NamedTuple, Self
 from urllib.parse import unquote

-SELF_DIR = Path(__file__).parent
-RELEASE_URL = "https://api.github.com/repos/indygreg/python-build-standalone/releases"
-HEADERS = {
-    "X-GitHub-Api-Version": "2022-11-28",
-}
-VERSIONS_FILE = SELF_DIR / "download-metadata.json"
-FLAVOR_PREFERENCES = [
-    "install_only",
-    "shared-pgo",
-    "shared-noopt",
-    "shared-noopt",
-    "static-noopt",
-    "pgo+lto",
-    "lto",
-    "pgo",
-]
-HIDDEN_FLAVORS = [
-    "debug",
-    "noopt",
-]
-SPECIAL_TRIPLES = {
-    "macos": "x86_64-apple-darwin",
-    "linux64": "x86_64-unknown-linux-gnu",
-    "windows-amd64": "x86_64-pc-windows",
-    "windows-x86": "i686-pc-windows",
-    "windows-amd64-shared": "x86_64-pc-windows",
-    "windows-x86-shared": "i686-pc-windows",
-    "linux64-musl": "x86_64-unknown-linux-musl",
-}
+import httpx

-_filename_re = re.compile(
-    r"""(?x)
-    ^
-        cpython-(?P<ver>\d+\.\d+\.\d+?)
-        (?:\+\d+)?
-        -(?P<triple>.*?)
-        (?:-[\dT]+)?\.tar\.(?:gz|zst)
-    $
-"""
-)
-_flavor_re = re.compile(
-    r"""(?x)^(.*?)-(%s)$"""
-    % (
-        "|".join(
-            map(
-                re.escape,
-                sorted(FLAVOR_PREFERENCES + HIDDEN_FLAVORS, key=len, reverse=True),
+SELF_DIR = Path(__file__).parent
+VERSIONS_FILE = SELF_DIR / "download-metadata.json"
+
+
+def batched(iterable: Iterable, n: int) -> Generator[tuple, None, None]:
+    """Batch data into tuples of length n. The last batch may be shorter."""
+    # batched('ABCDEFG', 3) --> ABC DEF G
+    if n < 1:
+        raise ValueError("n must be at least one")
+    it = iter(iterable)
+    while batch := tuple(itertools.islice(it, n)):
+        yield batch
+
+
+class PlatformTriple(NamedTuple):
+    arch: str
+    platform: str
+    libc: str
+
+
+class Version(NamedTuple):
+    major: int
+    minor: int
+    patch: int
+
+    @classmethod
+    def from_str(cls, version: str) -> Self:
+        major, minor, patch = version.split(".", 3)
+        return cls(int(major), int(minor), int(patch))
+
+    def __str__(self) -> str:
+        return f"{self.major}.{self.minor}.{self.patch}"
+
+
+class ImplementationName(StrEnum):
+    CPYTHON = "cpython"
+    PYPY = "pypy"
+
+
+@dataclass
+class PythonDownload:
+    version: Version
+    triple: PlatformTriple
+    flavor: str | None
+    implementation: ImplementationName
+    filename: str
+    url: str
+    sha256: str | None = None
+
+    def key(self) -> str:
+        return f"{self.implementation}-{self.version}-{self.triple.platform}-{self.triple.arch}-{self.triple.libc}"
+
+
+class Finder:
+    implementation: ImplementationName
+
+    @abc.abstractmethod
+    async def find(self) -> list[PythonDownload]:
+        raise NotImplementedError
+
+
+class CPythonFinder(Finder):
+    implementation = ImplementationName.CPYTHON
+
+    RELEASE_URL = (
+        "https://api.github.com/repos/indygreg/python-build-standalone/releases"
+    )
+
+    FLAVOR_PREFERENCES = [
+        "install_only",
+        "shared-pgo",
+        "shared-noopt",
+        "static-noopt",
+        "pgo+lto",
+        "pgo",
+        "lto",
+        "debug",
+    ]
+    HIDDEN_FLAVORS = [
+        "noopt",
+    ]
+    SPECIAL_TRIPLES = {
+        "macos": "x86_64-apple-darwin",
+        "linux64": "x86_64-unknown-linux-gnu",
+        "windows-amd64": "x86_64-pc-windows",
+        "windows-x86": "i686-pc-windows",
+        "windows-amd64-shared": "x86_64-pc-windows",
+        "windows-x86-shared": "i686-pc-windows",
+        "linux64-musl": "x86_64-unknown-linux-musl",
+    }
+    # Normalized mappings to match the Rust types
+    ARCH_MAP = {
+        "ppc64": "powerpc64",
+        "ppc64le": "powerpc64le",
+    }
+
+    _filename_re = re.compile(
+        r"""(?x)
+        ^
+            cpython-(?P<ver>\d+\.\d+\.\d+?)
+            (?:\+\d+)?
+            -(?P<triple>.*?)
+            (?:-[\dT]+)?\.tar\.(?:gz|zst)
+        $
+    """
+    )
+
+    _flavor_re = re.compile(
+        r"""(?x)^(.*?)-(%s)$"""
+        % (
+            "|".join(
+                map(
+                    re.escape,
+                    sorted(FLAVOR_PREFERENCES + HIDDEN_FLAVORS, key=len, reverse=True),
+                )
            )
        )
    )
-)

-# Normalized mappings to match the Rust types
-ARCH_MAP = {
-    "ppc64": "powerpc64",
-    "ppc64le": "powerpc64le",
-}
+    def __init__(self, client: httpx.AsyncClient):
+        self.client = client

+    async def find(self) -> list[PythonDownload]:
+        downloads = await self._fetch_downloads()
+        await self._fetch_checksums(downloads, n=20)
+        return downloads

-def parse_filename(filename):
-    match = _filename_re.match(filename)
-    if match is None:
-        return
-    version, triple = match.groups()
-    if triple.endswith("-full"):
-        triple = triple[:-5]
-    match = _flavor_re.match(triple)
-    if match is not None:
-        triple, flavor = match.groups()
-    else:
-        flavor = None
+    async def _fetch_downloads(self, pages: int = 100) -> list[PythonDownload]:
+        """Fetch all the indygreg downloads from the release API."""
+        results: dict[Version, list[PythonDownload]] = {}

-    return (version, triple, flavor)
-
-
-def normalize_triple(triple):
-    if "-static" in triple:
-        logging.debug("Skipping %r: static unsupported", triple)
-        return
-    triple = SPECIAL_TRIPLES.get(triple, triple)
-    pieces = triple.split("-")
-    try:
-        arch = normalize_arch(pieces[0])
-        operating_system = normalize_os(pieces[2])
-        if pieces[2] == "linux":
-            # On linux, the triple has four segments, the last one is the libc
-            libc = pieces[3]
-        else:
-            libc = "none"
-    except IndexError:
-        logging.debug("Skipping %r: unknown triple", triple)
-        return
-    return "%s-%s-%s" % (arch, operating_system, libc)
-
-
-def normalize_arch(arch):
-    arch = ARCH_MAP.get(arch, arch)
-    pieces = arch.split("_")
-    # Strip `_vN` from `x86_64`
-    return "_".join(pieces[:2])
-
-
-def normalize_os(os):
-    return os
-
-
-def read_sha256(url):
-    try:
-        resp = request(url + ".sha256")
-    except urllib.error.HTTPError:
-        return None
-    assert resp.status == 200
-    return resp.read().decode().strip()
-
-
-def sha256(path):
-    h = hashlib.sha256()
-
-    with open(path, "rb") as file:
-        while True:
-            # Reading is buffered, so we can read smaller chunks.
-            chunk = file.read(h.block_size)
-            if not chunk:
+        # Collect all available Python downloads
+        for page in range(1, pages + 1):
+            logging.info("Fetching CPython release page %d", page)
+            resp = await self.client.get(self.RELEASE_URL, params={"page": page})
+            resp.raise_for_status()
+            rows = resp.json()
+            if not rows:
                break
-            h.update(chunk)
+            for row in rows:
+                for asset in row["assets"]:
+                    url = asset["browser_download_url"]
+                    download = self._parse_download_url(url)
+                    if download is None:
+                        continue
+                    results.setdefault(download.version, []).append(download)

-    return h.hexdigest()
+        # Collapse CPython variants to a single URL flavor per triple
+        downloads = []
+        for choices in results.values():
+            flavors = {}
+            for choice in choices:
+                priority = self._get_flavor_priority(choice.flavor)
+                existing = flavors.get(choice.triple)
+                if existing:
+                    _, existing_priority = existing
+                    # Skip if we have a flavor with higher priority already (indicated by a smaller value)
+                    if priority >= existing_priority:
+                        continue
+                flavors[choice.triple] = (choice, priority)
+
+            # Drop the priorities
+            downloads.extend([choice for choice, _ in flavors.values()])
+
+        return downloads
+
+    async def _fetch_checksums(self, downloads: list[PythonDownload], n: int) -> None:
+        """Fetch the checksums for the given downloads."""
+        checksum_urls = set()
+        for download in downloads:
+            release_base_url = download.url.rsplit("/", maxsplit=1)[0]
+            checksum_url = release_base_url + "/SHA256SUMS"
+            checksum_urls.add(checksum_url)
+
+        async def fetch_checksums(url: str) -> httpx.Response | None:
+            try:
+                resp = await self.client.get(url)
+                resp.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                if e.response.status_code == 404:
+                    return None
+                raise
+            return resp
+
+        completed = 0
+        tasks = []
+        for batch in batched(checksum_urls, n):
+            logging.info(
+                "Fetching CPython checksums: %d/%d", completed, len(checksum_urls)
+            )
+            async with asyncio.TaskGroup() as tg:
+                for url in batch:
+                    task = tg.create_task(fetch_checksums(url))
+                    tasks.append(task)
+            completed += n
+
+        checksums = {}
+        for task in tasks:
+            resp = task.result()
+            if resp is None:
+                continue
+            lines = resp.text.splitlines()
+            for line in lines:
+                checksum, filename = line.split(" ", maxsplit=1)
+                filename = filename.strip()
+                checksums[filename] = checksum
+
+        for download in downloads:
+            download.sha256 = checksums.get(download.filename)
+
+    def _parse_download_url(self, url: str) -> PythonDownload | None:
+        """Parse an indygreg download URL into a PythonDownload object."""
+        # Ex)
+        # https://github.com/indygreg/python-build-standalone/releases/download/20240107/cpython-3.12.1%2B20240107-aarch64-unknown-linux-gnu-lto-full.tar.zst
+        if url.endswith(".sha256"):
+            return
+        filename = unquote(url.rsplit("/", maxsplit=1)[-1])
+
+        match = self._filename_re.match(filename)
+        if match is None:
+            return
+
+        version, triple = match.groups()
+        if triple.endswith("-full"):
+            triple = triple[:-5]
+
+        match = self._flavor_re.match(triple)
+        if match is not None:
+            triple, flavor = match.groups()
+        else:
+            flavor = None
+        if flavor in self.HIDDEN_FLAVORS:
+            return
+
+        version = Version.from_str(version)
+        triple = self._normalize_triple(triple)
+        if triple is None:
+            return
+
+        return PythonDownload(
+            version=version,
+            triple=triple,
+            flavor=flavor,
+            implementation=self.implementation,
+            filename=filename,
+            url=url,
+        )
+
+    def _normalize_triple(self, triple: str) -> PlatformTriple | None:
+        if "-static" in triple:
+            logging.debug("Skipping %r: static unsupported", triple)
+            return
+
+        triple = self.SPECIAL_TRIPLES.get(triple, triple)
+        pieces = triple.split("-")
+        try:
+            arch = self._normalize_arch(pieces[0])
+            operating_system = self._normalize_os(pieces[2])
+            if pieces[2] == "linux":
+                # On linux, the triple has four segments, the last one is the libc
+                libc = pieces[3]
+            else:
+                libc = "none"
+        except IndexError:
+            logging.debug("Skipping %r: unknown triple", triple)
+            return
+
+        return PlatformTriple(arch, operating_system, libc)
+
+    def _normalize_arch(self, arch: str) -> str:
+        arch = self.ARCH_MAP.get(arch, arch)
+        pieces = arch.split("_")
+        # Strip `_vN` from `x86_64`
+        return "_".join(pieces[:2])
+
+    def _normalize_os(self, os: str) -> str:
+        return os
+
+    def _get_flavor_priority(self, flavor: str | None) -> int:
+        """Returns the priority of a flavor. Lower is better."""
+        try:
+            pref = self.FLAVOR_PREFERENCES.index(flavor)
+        except ValueError:
+            pref = len(self.FLAVOR_PREFERENCES) + 1
+        return pref


-def _get_flavor_priority(flavor):
-    """
-    Returns the priority of a flavor. Lower is better."""
-    try:
-        pref = FLAVOR_PREFERENCES.index(flavor)
-    except ValueError:
-        pref = len(FLAVOR_PREFERENCES) + 1
-    return pref
+def render(downloads: list[PythonDownload]) -> None:
+    """Render `download-metadata.json`."""

+    def sort_key(download: PythonDownload) -> tuple:
+        # Sort by implementation, version (latest first), and then by triple.
+        impl_order = [ImplementationName.CPYTHON, ImplementationName.PYPY]
+        return (
+            impl_order.index(download.implementation),
+            -download.version.major,
+            -download.version.minor,
+            -download.version.patch,
+            download.triple,
+        )

-def _sort_by_interpreter_and_version(info):
-    interpreter, version_tuple, _ = info
-    return (interpreter, version_tuple)
+    downloads.sort(key=sort_key)

-
-def request(url):
-    request = urllib.request.Request(url)
-    token = os.getenv("GH_TOKEN")
-    if token:
-        request.add_header("Authorization", "Bearer: {token}")
-    return urllib.request.urlopen(request)
-
-
-def find():
-    """
-    Find available Python versions and write metadata to a file.
-    """
    results = {}
-
-    # Collect all available Python downloads
-    for page in range(1, 100):
-        logging.debug("Reading release page %s...", page)
-        resp = request("%s?page=%d" % (RELEASE_URL, page))
-        rows = json.loads(resp.read())
-        if not rows:
-            break
-        for row in rows:
-            for asset in row["assets"]:
-                url = asset["browser_download_url"]
-                base_name = unquote(url.rsplit("/")[-1])
-                if base_name.endswith(".sha256"):
-                    continue
-                info = parse_filename(base_name)
-                if info is None:
-                    continue
-                py_ver, triple, flavor = info
-                if "-static" in triple or (flavor and "noopt" in flavor):
-                    continue
-                triple = normalize_triple(triple)
-                if triple is None:
-                    logging.debug("Skipping %s: unsupported triple", url)
-                    continue
-                results.setdefault(py_ver, []).append((triple, flavor, url))
-
-    # Collapse CPython variants to a single URL flavor per triple
-    cpython_results: dict[tuple[int, int, int], dict[tuple[str, str, str], str]] = {}
-    for py_ver, choices in results.items():
-        urls = {}
-        for triple, flavor, url in choices:
-            triple = tuple(triple.split("-"))
-            priority = _get_flavor_priority(flavor)
-            existing = urls.get(triple)
-            if existing:
-                _, _, existing_priority = existing
-                # Skip if we have a flavor with higher priority already
-                if priority >= existing_priority:
-                    continue
-            urls[triple] = (url, flavor, priority)
-
-        # Drop the priorities
-        cpython_results[tuple(map(int, py_ver.split(".")))] = {
-            triple: (url, flavor) for triple, (url, flavor, _) in urls.items()
+    for download in downloads:
+        key = download.key()
+        logging.info("Found %s (%s)", key, download.flavor)
+        results[key] = {
+            "name": download.implementation,
+            "arch": download.triple.arch,
+            "os": download.triple.platform,
+            "libc": download.triple.libc,
+            "major": download.version.major,
+            "minor": download.version.minor,
+            "patch": download.version.patch,
+            "url": download.url,
+            "sha256": download.sha256,
        }

-    # Collect variants across interpreter kinds
-    # TODO(zanieb): Note we only support CPython downloads at this time
-    #               but this will include PyPy chain in the future.
-    final_results = {}
-    for interpreter, py_ver, choices in sorted(
-        chain(
-            (("cpython",) + x for x in cpython_results.items()),
-        ),
-        key=_sort_by_interpreter_and_version,
-        # Reverse the ordering so newer versions are first
-        reverse=True,
-    ):
-        # Sort by the remaining information for determinism
-        # This groups download metadata in triple component order
-        for (arch, operating_system, libc), (url, flavor) in sorted(choices.items()):
-            key = "%s-%s.%s.%s-%s-%s-%s" % (
-                interpreter,
-                *py_ver,
-                operating_system,
-                arch,
-                libc,
-            )
-            logging.info("Found %s (%s)", key, flavor)
-            sha256 = read_sha256(url)
-            final_results[key] = {
-                "name": interpreter,
-                "arch": arch,
-                "os": operating_system,
-                "libc": libc,
-                "major": py_ver[0],
-                "minor": py_ver[1],
-                "patch": py_ver[2],
-                "url": url,
-                "sha256": sha256,
-            }
-
    VERSIONS_FILE.parent.mkdir(parents=True, exist_ok=True)
-    VERSIONS_FILE.write_text(json.dumps(final_results, indent=2))
+    # Make newlines consistent across platforms
+    VERSIONS_FILE.write_text(json.dumps(results, indent=2), newline="\n")
+
+
+async def find() -> None:
+    token = os.environ.get("GH_TOKEN")
+    if not token:
+        logging.warning(
+            "`GH_TOKEN` env var not found, you may hit rate limits for GitHub API requests."
+        )
+
+    headers = {"X-GitHub-Api-Version": "2022-11-28"}
+    if token:
+        headers["Authorization"] = "Bearer " + token
+    client = httpx.AsyncClient(follow_redirects=True, headers=headers, timeout=15)
+
+    # TODO: Add PyPyFinder
+    finders = [
+        CPythonFinder(client),
+    ]
+    downloads = []
+
+    async with client:
+        for finder in finders:
+            logging.info("Finding %s downloads...", finder.implementation)
+            downloads.extend(await finder.find())
+
+    render(downloads)


 def main():
@ -297,8 +413,10 @@ def main():
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
+    # Silence httpx logging
+    logging.getLogger("httpx").setLevel(logging.WARNING)

-    find()
+    asyncio.run(find())


 if __name__ == "__main__":
--- a/crates/uv-python/template-download-metadata.py
+++ b/crates/uv-python/template-download-metadata.py
@ -1,4 +1,9 @@
-#!/usr/bin/env python3.12
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#   "chevron-blue < 1",
+# ]
+# ///
 """
 Generate static Rust code from Python version download metadata.

@ -6,7 +11,7 @@ Generates the `downloads.inc` file from the `downloads.inc.mustache` template.

 Usage:

-    uv run --isolated --with chevron-blue -- crates/uv-python/template-download-metadata.py
+    uv run --isolated -- crates/uv-python/template-download-metadata.py
 """

 import sys
@ -16,6 +21,8 @@ import json
 import subprocess
 from pathlib import Path

+import chevron_blue
+
 CRATE_ROOT = Path(__file__).parent
 WORKSPACE_ROOT = CRATE_ROOT.parent.parent
 VERSION_METADATA = CRATE_ROOT / "download-metadata.json"
@ -23,16 +30,6 @@ TEMPLATE = CRATE_ROOT / "src" / "downloads.inc.mustache"
 TARGET = TEMPLATE.with_suffix("")


-try:
-    import chevron_blue
-except ImportError:
-    print(
-        "missing requirement `chevron-blue`",
-        file=sys.stderr,
-    )
-    exit(1)
-
-
 def prepare_name(name: str) -> str:
    match name:
        case "cpython":
@ -73,8 +70,8 @@ def main():
    debug = logging.getLogger().getEffectiveLevel() <= logging.DEBUG

    data = {}
-    data["generated_with"] = Path(__file__).relative_to(WORKSPACE_ROOT)
-    data["generated_from"] = TEMPLATE.relative_to(WORKSPACE_ROOT)
+    data["generated_with"] = Path(__file__).relative_to(WORKSPACE_ROOT).as_posix()
+    data["generated_from"] = TEMPLATE.relative_to(WORKSPACE_ROOT).as_posix()
    data["versions"] = [
        {"key": key, "value": prepare_value(value)}
        for key, value in json.loads(VERSION_METADATA.read_text()).items()