Improvements to the Python metadata fetch script (#4780)

This fell out of my investigation of https://github.com/astral-sh/uv/issues/4774,
but that bug was fixed by the reporter in #4775.

- Adds support for `GH_TOKEN` authentication again; this is mostly needed to
  avoid GitHub's rate limits when hacking on this script (see the sketch below).
- Clarifies some of the handling and logging of flavors.
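
For reference, here is a minimal sketch of the kind of token-aware request helper this adds, assuming the usual `GH_TOKEN` environment variable and GitHub's `Authorization: Bearer <token>` header scheme; the helper name and the example URL are illustrative, not the exact code in the diff below.

```python
import os
import urllib.request


def authenticated_request(url):
    """Open `url`, attaching a GitHub token from GH_TOKEN when one is set."""
    req = urllib.request.Request(url)
    token = os.getenv("GH_TOKEN")
    if token:
        # GitHub's REST API accepts a standard Bearer token; the token value
        # has to be interpolated into the header string.
        req.add_header("Authorization", f"Bearer {token}")
    return urllib.request.urlopen(req)


if __name__ == "__main__":
    # Example call against a public GitHub API endpoint.
    resp = authenticated_request(
        "https://api.github.com/repos/astral-sh/uv/releases?per_page=1"
    )
    print(resp.status)
```

Authenticating this way matters because GitHub's REST API allows on the order of 60 unauthenticated requests per hour versus 5,000 authenticated ones, which is easy to exhaust while iterating on a script that pages through releases.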
Zanie Blue authored on 2024-07-03 12:36:59 -04:00, committed by GitHub
parent 81442f0b4c
commit c0875fd8fe

```diff
@@ -19,6 +19,7 @@ import hashlib
 import json
 import logging
 import re
+import os
 import urllib.error
 import urllib.request
 from itertools import chain
@@ -65,7 +66,7 @@ _filename_re = re.compile(
     $
     """
 )
-_suffix_re = re.compile(
+_flavor_re = re.compile(
     r"""(?x)^(.*?)-(%s)$"""
     % (
         "|".join(
@@ -91,12 +92,13 @@ def parse_filename(filename):
     version, triple = match.groups()
     if triple.endswith("-full"):
         triple = triple[:-5]
-    match = _suffix_re.match(triple)
+    match = _flavor_re.match(triple)
     if match is not None:
-        triple, suffix = match.groups()
+        triple, flavor = match.groups()
     else:
-        suffix = None
-    return (version, triple, suffix)
+        flavor = None
+
+    return (version, triple, flavor)
 
 
 def normalize_triple(triple):
@@ -132,7 +134,7 @@ def normalize_os(os):
 
 def read_sha256(url):
     try:
-        resp = urllib.request.urlopen(url + ".sha256")
+        resp = request(url + ".sha256")
     except urllib.error.HTTPError:
         return None
     assert resp.status == 200
@@ -153,8 +155,9 @@ def sha256(path):
     return h.hexdigest()
 
 
-def _sort_by_flavor_preference(info):
-    _triple, flavor, _url = info
+def _get_flavor_priority(flavor):
+    """
+    Returns the priority of a flavor. Lower is better."""
     try:
         pref = FLAVOR_PREFERENCES.index(flavor)
     except ValueError:
@@ -167,6 +170,14 @@ def _sort_by_interpreter_and_version(info):
     return (interpreter, version_tuple)
 
 
+def request(url):
+    request = urllib.request.Request(url)
+    token = os.getenv("GH_TOKEN")
+    if token:
+        request.add_header("Authorization", "Bearer: {token}")
+    return urllib.request.urlopen(request)
+
+
 def find():
     """
     Find available Python versions and write metadata to a file.
@@ -176,7 +187,7 @@ def find():
     # Collect all available Python downloads
     for page in range(1, 100):
         logging.debug("Reading release page %s...", page)
-        resp = urllib.request.urlopen("%s?page=%d" % (RELEASE_URL, page))
+        resp = request("%s?page=%d" % (RELEASE_URL, page))
         rows = json.loads(resp.read())
         if not rows:
             break
@@ -194,6 +205,7 @@ def find():
                     continue
                 triple = normalize_triple(triple)
                 if triple is None:
+                    logging.debug("Skipping %s: unsupported triple", url)
                     continue
                 results.setdefault(py_ver, []).append((triple, flavor, url))
 
@@ -201,13 +213,21 @@ def find():
     cpython_results: dict[tuple[int, int, int], dict[tuple[str, str, str], str]] = {}
     for py_ver, choices in results.items():
         urls = {}
-        for triple, flavor, url in sorted(choices, key=_sort_by_flavor_preference):
+        for triple, flavor, url in choices:
             triple = tuple(triple.split("-"))
-            # Skip existing triples, preferring the first flavor
-            if triple in urls:
-                continue
-            urls[triple] = url
-        cpython_results[tuple(map(int, py_ver.split(".")))] = urls
+            priority = _get_flavor_priority(flavor)
+            existing = urls.get(triple)
+            if existing:
+                _, _, existing_priority = existing
+                # Skip if we have a flavor with higher priority already
+                if priority >= existing_priority:
+                    continue
+            urls[triple] = (url, flavor, priority)
+
+        # Drop the priorities
+        cpython_results[tuple(map(int, py_ver.split(".")))] = {
+            triple: (url, flavor) for triple, (url, flavor, _) in urls.items()
+        }
 
     # Collect variants across interpreter kinds
     # TODO(zanieb): Note we only support CPython downloads at this time
@@ -223,7 +243,7 @@ def find():
     ):
         # Sort by the remaining information for determinism
         # This groups download metadata in triple component order
-        for (arch, operating_system, libc), url in sorted(choices.items()):
+        for (arch, operating_system, libc), (url, flavor) in sorted(choices.items()):
             key = "%s-%s.%s.%s-%s-%s-%s" % (
                 interpreter,
                 *py_ver,
@@ -231,9 +251,8 @@ def find():
                 arch,
                 libc,
             )
-            logging.info("Found %s", key)
+            logging.info("Found %s (%s)", key, flavor)
             sha256 = read_sha256(url)
-
             final_results[key] = {
                 "name": interpreter,
                 "arch": arch,
```