uv/crates/uv-toolchain/fetch-version-metadata.py
Zanie Blue 44e39bdca3
Replace Python bootstrapping script with Rust implementation (#2842)
See https://github.com/astral-sh/uv/issues/2617

Note this also includes:
- #2918 
- #2931 (pending)

A first step towards Python toolchain management in Rust.

First, we add a new crate to manage Python download metadata:

- Adds a new `uv-toolchain` crate
- Adds Rust structs for Python version download metadata
- Duplicates the script which downloads Python version metadata
- Adds a script to generate Rust code from the JSON metadata (an example entry is sketched below)
- Adds a utility to download and extract the Python version
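
Each entry in that intermediate JSON keys a download by interpreter, version, and target, and carries the fields the Rust structs model. A sketch of one entry as produced by the `fetch-version-metadata.py` script shown later on this page, with an illustrative placeholder URL and hash:

```python
# One entry of `python-version-metadata.json`; the URL and sha256 are placeholders.
{
    "cpython-3.12.2-linux-x86_64-gnu": {
        "name": "cpython",
        "arch": "x86_64",
        "os": "linux",
        "libc": "gnu",
        "major": 3,
        "minor": 12,
        "patch": 2,
        # Real entries point at a python-build-standalone release asset.
        "url": "https://github.com/indygreg/python-build-standalone/releases/download/<tag>/<asset>.tar.zst",
        # Taken from the companion `.sha256` asset when one is published, else null.
        "sha256": "<sha256 of the archive>",
    }
}
```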

I explored some alternatives, like a build script using `serde` and
`uneval` to construct the code from our structs automatically, but
deemed it too heavy. Unlike Rye, I don't generate the Rust directly
from the web requests; instead, there's an intermediate JSON layer,
which speeds up iteration on the Rust types.

Next, we add a `uv-dev` command, `fetch-python`, to download Python
versions as the bootstrapping script did (an example invocation is
sketched below).

- Downloads a requested version or reads from `.python-versions`
- Extracts to `UV_BOOTSTRAP_DIR`
- Links executables for path extension
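
A hypothetical invocation, assuming `fetch-python` accepts versions as positional arguments (the subcommand name comes from this PR; the exact argument syntax may differ):

```sh
# Illustrative only: run the `fetch-python` subcommand of the `uv-dev` binary.
# With no arguments it would fall back to `.python-versions`; archives are
# extracted into `UV_BOOTSTRAP_DIR`.
cargo run -p uv-dev -- fetch-python 3.12.2
```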

This command is not really intended to be user-facing, but it's a good
proof of concept for the `uv-toolchain` API. Hash checking (via the
sha256) isn't implemented yet; we can do that in a follow-up.

Finally, we remove the `scripts/bootstrap` directory, update CI to use
the new command, and update the CONTRIBUTING docs.

<img width="1023" alt="Screenshot 2024-04-08 at 17 12 15"
src="57bd3cf1-7477-4bb8-a8e9-802a00d772cb">
2024-04-10 11:22:41 -05:00


#!/usr/bin/env python3.12
"""
Fetch Python version metadata.
Generates the `python-version-metadata.json` file.
Usage:
python fetch-version-metadata.py
Acknowledgements:
Derived from https://github.com/mitsuhiko/rye/tree/f9822267a7f00332d15be8551f89a212e7bc9017
Originally authored by Armin Ronacher under the MIT license
"""
import argparse
import hashlib
import json
import logging
import re
import urllib.error
import urllib.request
from itertools import chain
from pathlib import Path
from urllib.parse import unquote

SELF_DIR = Path(__file__).parent
RELEASE_URL = "https://api.github.com/repos/indygreg/python-build-standalone/releases"
HEADERS = {
    "X-GitHub-Api-Version": "2022-11-28",
}
VERSIONS_FILE = SELF_DIR / "python-version-metadata.json"

FLAVOR_PREFERENCES = [
    "shared-pgo",
    "shared-noopt",
    "static-noopt",
    "gnu-pgo+lto",
    "gnu-lto",
    "gnu-pgo",
    "pgo+lto",
    "lto",
    "pgo",
]
HIDDEN_FLAVORS = [
    "debug",
    "noopt",
    "install_only",
]
SPECIAL_TRIPLES = {
    "macos": "x86_64-apple-darwin",
    "linux64": "x86_64-unknown-linux-gnu",
    "windows-amd64": "x86_64-pc-windows",
    "windows-x86": "i686-pc-windows",
    "windows-amd64-shared": "x86_64-pc-windows",
    "windows-x86-shared": "i686-pc-windows",
    "linux64-musl": "x86_64-unknown-linux-musl",
}

_filename_re = re.compile(
    r"""(?x)
    ^
    cpython-(?P<ver>\d+\.\d+\.\d+?)
    (?:\+\d+)?
    -(?P<triple>.*?)
    (?:-[\dT]+)?\.tar\.(?:gz|zst)
    $
    """
)

_suffix_re = re.compile(
    r"""(?x)^(.*?)-(%s)$"""
    % (
        "|".join(
            map(
                re.escape,
                sorted(FLAVOR_PREFERENCES + HIDDEN_FLAVORS, key=len, reverse=True),
            )
        )
    )
)

# Normalized mappings to match the Rust types
ARCH_MAP = {
    "ppc64": "powerpc64",
    "ppc64le": "powerpc64le",
    "i686": "x86",
    "i386": "x86",
}
OS_MAP = {"darwin": "macos"}


def parse_filename(filename):
    """Parse a python-build-standalone artifact name into (version, triple, flavor)."""
    match = _filename_re.match(filename)
    if match is None:
        return
    version, triple = match.groups()
    if triple.endswith("-full"):
        triple = triple[:-5]
    match = _suffix_re.match(triple)
    if match is not None:
        triple, suffix = match.groups()
    else:
        suffix = None
    return (version, triple, suffix)


def normalize_triple(triple):
    """Normalize a target triple to `arch-os-libc`, or return None if unsupported."""
    if "-static" in triple:
        logging.debug("Skipping %r: static flavors are unsupported", triple)
        return
    triple = SPECIAL_TRIPLES.get(triple, triple)
    pieces = triple.split("-")
    try:
        arch = normalize_arch(pieces[0])
        operating_system = normalize_os(pieces[2])
        if pieces[2] == "linux":
            # On Linux, the triple has four segments; the last one is the libc
            libc = pieces[3]
        else:
            libc = "none"
    except IndexError:
        logging.debug("Skipping %r: unknown triple", triple)
        return
    return "%s-%s-%s" % (arch, operating_system, libc)


def normalize_arch(arch):
    arch = ARCH_MAP.get(arch, arch)
    pieces = arch.split("_")
    # Strip the `_vN` variant suffix, e.g. `x86_64_v2` -> `x86_64`
    return "_".join(pieces[:2])


def normalize_os(os):
    return OS_MAP.get(os, os)


def read_sha256(url):
    """Fetch the expected SHA256 from the adjacent `.sha256` asset, if it exists."""
    try:
        resp = urllib.request.urlopen(url + ".sha256")
    except urllib.error.HTTPError:
        return None
    assert resp.status == 200
    return resp.read().decode().strip()


def sha256(path):
    """Compute the SHA256 hash of the file at `path`."""
    h = hashlib.sha256()
    with open(path, "rb") as file:
        while True:
            # Reading is buffered, so we can read smaller chunks.
            chunk = file.read(h.block_size)
            if not chunk:
                break
            h.update(chunk)
    return h.hexdigest()


def _sort_by_flavor_preference(info):
    _triple, flavor, _url = info
    try:
        pref = FLAVOR_PREFERENCES.index(flavor)
    except ValueError:
        pref = len(FLAVOR_PREFERENCES) + 1
    return pref


def _sort_by_interpreter_and_version(info):
    interpreter, version_tuple, _ = info
    return (interpreter, version_tuple)


def find():
    """
    Find available Python versions and write metadata to a file.
    """
    results = {}

    # Collect all available Python downloads
    for page in range(1, 100):
        logging.debug("Reading release page %s...", page)
        resp = urllib.request.urlopen("%s?page=%d" % (RELEASE_URL, page))
        rows = json.loads(resp.read())
        if not rows:
            break
        for row in rows:
            for asset in row["assets"]:
                url = asset["browser_download_url"]
                base_name = unquote(url.rsplit("/")[-1])
                if base_name.endswith(".sha256"):
                    continue
                info = parse_filename(base_name)
                if info is None:
                    continue
                py_ver, triple, flavor = info
                if "-static" in triple or (flavor and "noopt" in flavor):
                    continue
                triple = normalize_triple(triple)
                if triple is None:
                    continue
                results.setdefault(py_ver, []).append((triple, flavor, url))

    # Collapse CPython variants to a single URL flavor per triple
    cpython_results: dict[tuple[int, int, int], dict[tuple[str, str, str], str]] = {}
    for py_ver, choices in results.items():
        urls = {}
        for triple, flavor, url in sorted(choices, key=_sort_by_flavor_preference):
            triple = tuple(triple.split("-"))
            # Skip existing triples, preferring the first flavor
            if triple in urls:
                continue
            urls[triple] = url
        cpython_results[tuple(map(int, py_ver.split(".")))] = urls

    # Collect variants across interpreter kinds
    # TODO(zanieb): Note we only support CPython downloads at this time,
    # but the `chain` below will include PyPy in the future.
    final_results = {}
    for interpreter, py_ver, choices in sorted(
        chain(
            (("cpython",) + x for x in cpython_results.items()),
        ),
        key=_sort_by_interpreter_and_version,
        # Reverse the ordering so newer versions are first
        reverse=True,
    ):
        # Sort by the remaining information for determinism
        # This groups download metadata in triple component order
        for (arch, operating_system, libc), url in sorted(choices.items()):
            key = "%s-%s.%s.%s-%s-%s-%s" % (
                interpreter,
                *py_ver,
                operating_system,
                arch,
                libc,
            )
            logging.info("Found %s", key)
            sha256 = read_sha256(url)
            final_results[key] = {
                "name": interpreter,
                "arch": arch,
                "os": operating_system,
                "libc": libc,
                "major": py_ver[0],
                "minor": py_ver[1],
                "patch": py_ver[2],
                "url": url,
                "sha256": sha256,
            }

    VERSIONS_FILE.parent.mkdir(parents=True, exist_ok=True)
    VERSIONS_FILE.write_text(json.dumps(final_results, indent=2))


def main():
    parser = argparse.ArgumentParser(description="Fetch Python version metadata.")
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable debug logging",
    )
    parser.add_argument(
        "-q",
        "--quiet",
        action="store_true",
        help="Disable logging",
    )
    args = parser.parse_args()

    if args.quiet:
        log_level = logging.CRITICAL
    elif args.verbose:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    find()


if __name__ == "__main__":
    main()