Refactor fetch-download-metadata.py script (#4853)

## Summary

Similar to https://github.com/astral-sh/rye/pull/680, I have made a
major refactor to the `fetch-download-metadata.py` script.

Some notable changes:

- Use PEP 723 inline scripts
- Fully type annotated the script
- Implemented async HTTP fetching
- Introduced a `Finder` base class and moved the finder logic under the
`CPythonFinder` subclass, which will make it easier to add a
`PyPyFinder` later.
- Instead of fetching `xxx.sha256` for each file, the script now fetches
a single `SHA256SUMS` file containing checksums for all files in the
release.

As a result, the script now takes around 10 seconds instead of 10+
minutes.

## Plan for Future PRs

- [ ] Implement the `PyPyFinder`
- [ ] Add a GitHub Action to run `fetch-download-metadata.py` daily and
create a PR automatically

## Test Plan

```sh
cargo run -- run --isolated -- ./crates/uv-python/fetch-download-metadata.py
```
This commit is contained in:
Jo 2024-07-23 11:06:25 +08:00 committed by GitHub
parent f371195536
commit 0a6efe4d26
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 359 additions and 244 deletions

View file

@ -1,4 +1,9 @@
#!/usr/bin/env python3.12
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "httpx < 1",
# ]
# ///
"""
Fetch Python version download metadata.
@ -6,7 +11,7 @@ Generates the `download-metadata.json` file.
Usage:
python fetch-download-metadata.py
uv run --isolated -- crates/uv-python/fetch-download-metadata.py
Acknowledgements:
@ -14,259 +19,370 @@ Acknowledgements:
Originally authored by Armin Ronacher under the MIT license
"""
import abc
import argparse
import hashlib
import asyncio
import itertools
import json
import logging
import re
import os
import urllib.error
import urllib.request
from itertools import chain
import re
from dataclasses import dataclass
from enum import StrEnum
from pathlib import Path
from typing import Generator, Iterable, NamedTuple, Self
from urllib.parse import unquote
SELF_DIR = Path(__file__).parent
RELEASE_URL = "https://api.github.com/repos/indygreg/python-build-standalone/releases"
HEADERS = {
"X-GitHub-Api-Version": "2022-11-28",
}
VERSIONS_FILE = SELF_DIR / "download-metadata.json"
FLAVOR_PREFERENCES = [
"install_only",
"shared-pgo",
"shared-noopt",
"shared-noopt",
"static-noopt",
"pgo+lto",
"lto",
"pgo",
]
HIDDEN_FLAVORS = [
"debug",
"noopt",
]
SPECIAL_TRIPLES = {
"macos": "x86_64-apple-darwin",
"linux64": "x86_64-unknown-linux-gnu",
"windows-amd64": "x86_64-pc-windows",
"windows-x86": "i686-pc-windows",
"windows-amd64-shared": "x86_64-pc-windows",
"windows-x86-shared": "i686-pc-windows",
"linux64-musl": "x86_64-unknown-linux-musl",
}
import httpx
_filename_re = re.compile(
r"""(?x)
^
cpython-(?P<ver>\d+\.\d+\.\d+?)
(?:\+\d+)?
-(?P<triple>.*?)
(?:-[\dT]+)?\.tar\.(?:gz|zst)
$
"""
)
_flavor_re = re.compile(
r"""(?x)^(.*?)-(%s)$"""
% (
"|".join(
map(
re.escape,
sorted(FLAVOR_PREFERENCES + HIDDEN_FLAVORS, key=len, reverse=True),
SELF_DIR = Path(__file__).parent
VERSIONS_FILE = SELF_DIR / "download-metadata.json"
def batched(iterable: Iterable, n: int) -> Generator[tuple, None, None]:
    """Yield successive tuples of up to ``n`` items taken from ``iterable``.

    The final tuple may contain fewer than ``n`` items. Raises ``ValueError``
    (on first iteration) when ``n`` is smaller than one.
    """
    # batched('ABCDEFG', 3) --> ABC DEF G
    if n < 1:
        raise ValueError("n must be at least one")
    source = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(source, n))
        if not chunk:
            # The underlying iterator is exhausted.
            return
        yield chunk
class PlatformTriple(NamedTuple):
    """The (arch, platform, libc) components identifying a download target."""

    # CPU architecture, e.g. "x86_64".
    arch: str
    # Operating system segment of the target triple, e.g. "linux".
    platform: str
    # libc flavour; "none" is used when the platform has no libc segment.
    libc: str
class Version(NamedTuple):
major: int
minor: int
patch: int
@classmethod
def from_str(cls, version: str) -> Self:
major, minor, patch = version.split(".", 3)
return cls(int(major), int(minor), int(patch))
def __str__(self) -> str:
return f"{self.major}.{self.minor}.{self.patch}"
class ImplementationName(StrEnum):
CPYTHON = "cpython"
PYPY = "pypy"
@dataclass
class PythonDownload:
    """A single downloadable Python distribution and its metadata."""

    version: Version
    triple: PlatformTriple
    flavor: str | None
    implementation: ImplementationName
    filename: str
    url: str
    # Filled in after construction once the release checksums are known.
    sha256: str | None = None

    def key(self) -> str:
        """Return the ``<impl>-<version>-<platform>-<arch>-<libc>`` identifier."""
        target = self.triple
        prefix = f"{self.implementation}-{self.version}"
        return f"{prefix}-{target.platform}-{target.arch}-{target.libc}"
class Finder(abc.ABC):
    """Abstract base class for per-implementation download finders.

    Subclasses set ``implementation`` and implement ``find``. Deriving from
    ``abc.ABC`` is what makes ``@abc.abstractmethod`` effective: a subclass
    that forgets to implement ``find`` cannot be instantiated.
    """

    # The implementation whose downloads this finder discovers.
    implementation: ImplementationName

    @abc.abstractmethod
    async def find(self) -> list[PythonDownload]:
        """Return all downloads discovered for this implementation."""
        raise NotImplementedError
class CPythonFinder(Finder):
implementation = ImplementationName.CPYTHON
RELEASE_URL = (
"https://api.github.com/repos/indygreg/python-build-standalone/releases"
)
FLAVOR_PREFERENCES = [
"install_only",
"shared-pgo",
"shared-noopt",
"static-noopt",
"pgo+lto",
"pgo",
"lto",
"debug",
]
HIDDEN_FLAVORS = [
"noopt",
]
SPECIAL_TRIPLES = {
"macos": "x86_64-apple-darwin",
"linux64": "x86_64-unknown-linux-gnu",
"windows-amd64": "x86_64-pc-windows",
"windows-x86": "i686-pc-windows",
"windows-amd64-shared": "x86_64-pc-windows",
"windows-x86-shared": "i686-pc-windows",
"linux64-musl": "x86_64-unknown-linux-musl",
}
# Normalized mappings to match the Rust types
ARCH_MAP = {
"ppc64": "powerpc64",
"ppc64le": "powerpc64le",
}
_filename_re = re.compile(
r"""(?x)
^
cpython-(?P<ver>\d+\.\d+\.\d+?)
(?:\+\d+)?
-(?P<triple>.*?)
(?:-[\dT]+)?\.tar\.(?:gz|zst)
$
"""
)
_flavor_re = re.compile(
r"""(?x)^(.*?)-(%s)$"""
% (
"|".join(
map(
re.escape,
sorted(FLAVOR_PREFERENCES + HIDDEN_FLAVORS, key=len, reverse=True),
)
)
)
)
)
# Normalized mappings to match the Rust types
ARCH_MAP = {
"ppc64": "powerpc64",
"ppc64le": "powerpc64le",
}
def __init__(self, client: httpx.AsyncClient):
    """Store the HTTP client used for all requests made by this finder."""
    self.client = client
async def find(self) -> list[PythonDownload]:
    """Return the CPython downloads after attaching their checksums."""
    found = await self._fetch_downloads()
    # Checksum files are requested in concurrent batches of 20.
    await self._fetch_checksums(found, n=20)
    return found
def parse_filename(filename):
match = _filename_re.match(filename)
if match is None:
return
version, triple = match.groups()
if triple.endswith("-full"):
triple = triple[:-5]
match = _flavor_re.match(triple)
if match is not None:
triple, flavor = match.groups()
else:
flavor = None
async def _fetch_downloads(self, pages: int = 100) -> list[PythonDownload]:
"""Fetch all the indygreg downloads from the release API."""
results: dict[Version, list[PythonDownload]] = {}
return (version, triple, flavor)
def normalize_triple(triple):
if "-static" in triple:
logging.debug("Skipping %r: static unsupported", triple)
return
triple = SPECIAL_TRIPLES.get(triple, triple)
pieces = triple.split("-")
try:
arch = normalize_arch(pieces[0])
operating_system = normalize_os(pieces[2])
if pieces[2] == "linux":
# On linux, the triple has four segments, the last one is the libc
libc = pieces[3]
else:
libc = "none"
except IndexError:
logging.debug("Skipping %r: unknown triple", triple)
return
return "%s-%s-%s" % (arch, operating_system, libc)
def normalize_arch(arch):
arch = ARCH_MAP.get(arch, arch)
pieces = arch.split("_")
# Strip `_vN` from `x86_64`
return "_".join(pieces[:2])
def normalize_os(os):
return os
def read_sha256(url):
try:
resp = request(url + ".sha256")
except urllib.error.HTTPError:
return None
assert resp.status == 200
return resp.read().decode().strip()
def sha256(path):
h = hashlib.sha256()
with open(path, "rb") as file:
while True:
# Reading is buffered, so we can read smaller chunks.
chunk = file.read(h.block_size)
if not chunk:
# Collect all available Python downloads
for page in range(1, pages + 1):
logging.info("Fetching CPython release page %d", page)
resp = await self.client.get(self.RELEASE_URL, params={"page": page})
resp.raise_for_status()
rows = resp.json()
if not rows:
break
h.update(chunk)
for row in rows:
for asset in row["assets"]:
url = asset["browser_download_url"]
download = self._parse_download_url(url)
if download is None:
continue
results.setdefault(download.version, []).append(download)
return h.hexdigest()
# Collapse CPython variants to a single URL flavor per triple
downloads = []
for choices in results.values():
flavors = {}
for choice in choices:
priority = self._get_flavor_priority(choice.flavor)
existing = flavors.get(choice.triple)
if existing:
_, existing_priority = existing
# Skip if we have a flavor with higher priority already (indicated by a smaller value)
if priority >= existing_priority:
continue
flavors[choice.triple] = (choice, priority)
# Drop the priorities
downloads.extend([choice for choice, _ in flavors.values()])
return downloads
async def _fetch_checksums(self, downloads: list[PythonDownload], n: int) -> None:
    """Fetch the checksums for the given downloads.

    Collects the distinct release directories of ``downloads``, requests each
    release's ``SHA256SUMS`` file in concurrent batches of ``n``, and stores
    the matching checksum on every download's ``sha256`` attribute. Downloads
    with no matching entry get ``sha256 = None``.

    Raises ``httpx.HTTPStatusError`` for any non-404 error response.
    """
    checksum_urls = set()
    for download in downloads:
        release_base_url = download.url.rsplit("/", maxsplit=1)[0]
        checksum_urls.add(release_base_url + "/SHA256SUMS")

    async def fetch_checksums(url: str) -> httpx.Response | None:
        # A release without a SHA256SUMS file (404) is tolerated; every
        # other HTTP error propagates to the caller.
        try:
            resp = await self.client.get(url)
            resp.raise_for_status()
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                return None
            raise
        return resp

    completed = 0
    tasks = []
    for batch in batched(checksum_urls, n):
        logging.info(
            "Fetching CPython checksums: %d/%d", completed, len(checksum_urls)
        )
        # Leaving the TaskGroup waits for every request of this batch.
        async with asyncio.TaskGroup() as tg:
            for url in batch:
                tasks.append(tg.create_task(fetch_checksums(url)))
        # The last batch may be shorter than n, so count what was actually sent.
        completed += len(batch)

    checksums = {}
    for task in tasks:
        resp = task.result()
        if resp is None:
            continue
        for line in resp.text.splitlines():
            # Each entry is "<checksum> <filename>". Skip blank or malformed
            # lines instead of failing on tuple unpacking.
            fields = line.split(maxsplit=1)
            if len(fields) != 2:
                continue
            checksum, filename = fields
            checksums[filename.strip()] = checksum

    for download in downloads:
        download.sha256 = checksums.get(download.filename)
def _parse_download_url(self, url: str) -> PythonDownload | None:
    """Build a PythonDownload from an indygreg release asset URL.

    Returns ``None`` for ``.sha256`` assets, file names that do not match the
    expected pattern, hidden flavors, and triples that cannot be normalized.
    """
    # Ex)
    # https://github.com/indygreg/python-build-standalone/releases/download/20240107/cpython-3.12.1%2B20240107-aarch64-unknown-linux-gnu-lto-full.tar.zst
    if url.endswith(".sha256"):
        return None

    # The last path segment is the percent-encoded file name.
    _, _, encoded_name = url.rpartition("/")
    filename = unquote(encoded_name)

    name_match = self._filename_re.match(filename)
    if name_match is None:
        return None
    raw_version = name_match["ver"]
    raw_triple = name_match["triple"].removesuffix("-full")

    # Split a trailing known flavor off the triple, if there is one.
    flavor = None
    flavor_match = self._flavor_re.match(raw_triple)
    if flavor_match is not None:
        raw_triple, flavor = flavor_match.groups()
    if flavor in self.HIDDEN_FLAVORS:
        return None

    version = Version.from_str(raw_version)
    triple = self._normalize_triple(raw_triple)
    if triple is None:
        return None

    return PythonDownload(
        version=version,
        triple=triple,
        flavor=flavor,
        implementation=self.implementation,
        filename=filename,
        url=url,
    )
def _normalize_triple(self, triple: str) -> PlatformTriple | None:
    """Convert a raw target triple string into a PlatformTriple.

    Returns ``None`` (after a debug log) for static builds and for triples
    with too few segments.
    """
    if "-static" in triple:
        logging.debug("Skipping %r: static unsupported", triple)
        return None

    # Short aliases such as "linux64" are expanded to full triples first.
    resolved = self.SPECIAL_TRIPLES.get(triple, triple)
    segments = resolved.split("-")
    try:
        arch = self._normalize_arch(segments[0])
        operating_system = self._normalize_os(segments[2])
        # On linux, the triple has four segments, the last one is the libc
        libc = segments[3] if segments[2] == "linux" else "none"
    except IndexError:
        logging.debug("Skipping %r: unknown triple", resolved)
        return None

    return PlatformTriple(arch, operating_system, libc)
def _normalize_arch(self, arch: str) -> str:
arch = self.ARCH_MAP.get(arch, arch)
pieces = arch.split("_")
# Strip `_vN` from `x86_64`
return "_".join(pieces[:2])
def _normalize_os(self, os: str) -> str:
return os
def _get_flavor_priority(self, flavor: str | None) -> int:
"""Returns the priority of a flavor. Lower is better."""
try:
pref = self.FLAVOR_PREFERENCES.index(flavor)
except ValueError:
pref = len(self.FLAVOR_PREFERENCES) + 1
return pref
def _get_flavor_priority(flavor):
"""
Returns the priority of a flavor. Lower is better."""
try:
pref = FLAVOR_PREFERENCES.index(flavor)
except ValueError:
pref = len(FLAVOR_PREFERENCES) + 1
return pref
def render(downloads: list[PythonDownload]) -> None:
"""Render `download-metadata.json`."""
def sort_key(download: PythonDownload) -> tuple:
# Sort by implementation, version (latest first), and then by triple.
impl_order = [ImplementationName.CPYTHON, ImplementationName.PYPY]
return (
impl_order.index(download.implementation),
-download.version.major,
-download.version.minor,
-download.version.patch,
download.triple,
)
def _sort_by_interpreter_and_version(info):
interpreter, version_tuple, _ = info
return (interpreter, version_tuple)
downloads.sort(key=sort_key)
def request(url):
request = urllib.request.Request(url)
token = os.getenv("GH_TOKEN")
if token:
request.add_header("Authorization", "Bearer: {token}")
return urllib.request.urlopen(request)
def find():
"""
Find available Python versions and write metadata to a file.
"""
results = {}
# Collect all available Python downloads
for page in range(1, 100):
logging.debug("Reading release page %s...", page)
resp = request("%s?page=%d" % (RELEASE_URL, page))
rows = json.loads(resp.read())
if not rows:
break
for row in rows:
for asset in row["assets"]:
url = asset["browser_download_url"]
base_name = unquote(url.rsplit("/")[-1])
if base_name.endswith(".sha256"):
continue
info = parse_filename(base_name)
if info is None:
continue
py_ver, triple, flavor = info
if "-static" in triple or (flavor and "noopt" in flavor):
continue
triple = normalize_triple(triple)
if triple is None:
logging.debug("Skipping %s: unsupported triple", url)
continue
results.setdefault(py_ver, []).append((triple, flavor, url))
# Collapse CPython variants to a single URL flavor per triple
cpython_results: dict[tuple[int, int, int], dict[tuple[str, str, str], str]] = {}
for py_ver, choices in results.items():
urls = {}
for triple, flavor, url in choices:
triple = tuple(triple.split("-"))
priority = _get_flavor_priority(flavor)
existing = urls.get(triple)
if existing:
_, _, existing_priority = existing
# Skip if we have a flavor with higher priority already
if priority >= existing_priority:
continue
urls[triple] = (url, flavor, priority)
# Drop the priorities
cpython_results[tuple(map(int, py_ver.split(".")))] = {
triple: (url, flavor) for triple, (url, flavor, _) in urls.items()
for download in downloads:
key = download.key()
logging.info("Found %s (%s)", key, download.flavor)
results[key] = {
"name": download.implementation,
"arch": download.triple.arch,
"os": download.triple.platform,
"libc": download.triple.libc,
"major": download.version.major,
"minor": download.version.minor,
"patch": download.version.patch,
"url": download.url,
"sha256": download.sha256,
}
# Collect variants across interpreter kinds
# TODO(zanieb): Note we only support CPython downloads at this time
# but this will include PyPy chain in the future.
final_results = {}
for interpreter, py_ver, choices in sorted(
chain(
(("cpython",) + x for x in cpython_results.items()),
),
key=_sort_by_interpreter_and_version,
# Reverse the ordering so newer versions are first
reverse=True,
):
# Sort by the remaining information for determinism
# This groups download metadata in triple component order
for (arch, operating_system, libc), (url, flavor) in sorted(choices.items()):
key = "%s-%s.%s.%s-%s-%s-%s" % (
interpreter,
*py_ver,
operating_system,
arch,
libc,
)
logging.info("Found %s (%s)", key, flavor)
sha256 = read_sha256(url)
final_results[key] = {
"name": interpreter,
"arch": arch,
"os": operating_system,
"libc": libc,
"major": py_ver[0],
"minor": py_ver[1],
"patch": py_ver[2],
"url": url,
"sha256": sha256,
}
VERSIONS_FILE.parent.mkdir(parents=True, exist_ok=True)
VERSIONS_FILE.write_text(json.dumps(final_results, indent=2))
# Make newlines consistent across platforms
VERSIONS_FILE.write_text(json.dumps(results, indent=2), newline="\n")
async def find() -> None:
    """Collect downloads from every finder and render the metadata file.

    Reads the optional ``GH_TOKEN`` environment variable to authenticate
    GitHub API requests; a warning is logged when it is missing.
    """
    headers = {"X-GitHub-Api-Version": "2022-11-28"}
    token = os.environ.get("GH_TOKEN")
    if token:
        headers["Authorization"] = "Bearer " + token
    else:
        logging.warning(
            "`GH_TOKEN` env var not found, you may hit rate limits for GitHub API requests."
        )

    client = httpx.AsyncClient(follow_redirects=True, headers=headers, timeout=15)

    # TODO: Add PyPyFinder
    finders = [
        CPythonFinder(client),
    ]

    downloads = []
    # The client is closed when this context exits.
    async with client:
        for finder in finders:
            logging.info("Finding %s downloads...", finder.implementation)
            found = await finder.find()
            downloads.extend(found)

    render(downloads)
def main():
@ -297,8 +413,10 @@ def main():
format="%(asctime)s %(levelname)s %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
# Silence httpx logging
logging.getLogger("httpx").setLevel(logging.WARNING)
find()
asyncio.run(find())
if __name__ == "__main__":

View file

@ -1,4 +1,9 @@
#!/usr/bin/env python3.12
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "chevron-blue < 1",
# ]
# ///
"""
Generate static Rust code from Python version download metadata.
@ -6,7 +11,7 @@ Generates the `downloads.inc` file from the `downloads.inc.mustache` template.
Usage:
uv run --isolated --with chevron-blue -- crates/uv-python/template-download-metadata.py
uv run --isolated -- crates/uv-python/template-download-metadata.py
"""
import sys
@ -16,6 +21,8 @@ import json
import subprocess
from pathlib import Path
import chevron_blue
CRATE_ROOT = Path(__file__).parent
WORKSPACE_ROOT = CRATE_ROOT.parent.parent
VERSION_METADATA = CRATE_ROOT / "download-metadata.json"
@ -23,16 +30,6 @@ TEMPLATE = CRATE_ROOT / "src" / "downloads.inc.mustache"
TARGET = TEMPLATE.with_suffix("")
try:
import chevron_blue
except ImportError:
print(
"missing requirement `chevron-blue`",
file=sys.stderr,
)
exit(1)
def prepare_name(name: str) -> str:
match name:
case "cpython":
@ -73,8 +70,8 @@ def main():
debug = logging.getLogger().getEffectiveLevel() <= logging.DEBUG
data = {}
data["generated_with"] = Path(__file__).relative_to(WORKSPACE_ROOT)
data["generated_from"] = TEMPLATE.relative_to(WORKSPACE_ROOT)
data["generated_with"] = Path(__file__).relative_to(WORKSPACE_ROOT).as_posix()
data["generated_from"] = TEMPLATE.relative_to(WORKSPACE_ROOT).as_posix()
data["versions"] = [
{"key": key, "value": prepare_value(value)}
for key, value in json.loads(VERSION_METADATA.read_text()).items()