Extended ecosystem check with scraped data (#3858)

This commit is contained in:
konstin 2023-04-07 00:39:48 +02:00 committed by GitHub
parent cae5503e34
commit 454c6d9c2f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 140 additions and 13 deletions

View file

@ -0,0 +1,22 @@
# [crater](https://github.com/rust-lang/crater)-inspired check that tests against a large number of
# projects, mainly from https://github.com/akx/ruff-usage-aggregate.
#
# We run this in a Docker container as Ruff isn't designed for untrusted inputs.
#
# Either download https://github.com/akx/ruff-usage-aggregate/blob/master/data/known-github-tomls.jsonl as
# `github_search.jsonl` or follow the instructions in the README to scrape your own dataset.
#
# From the project root:
# ```
# cargo build
# docker buildx build -f scripts/Dockerfile.ecosystem -t ruff-ecosystem-checker --load .
# docker run --rm ruff-ecosystem-checker
# ```
FROM python:3.11
# Baseline binary: whatever ruff release pip resolves, isolated in its own virtualenv.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN python -m venv .venv && .venv/bin/pip install --no-cache-dir ruff
# COPY instead of ADD: these are plain local files, so ADD's implicit archive
# extraction / URL fetching is neither needed nor wanted.
COPY scripts/check_ecosystem.py check_ecosystem.py
COPY github_search.jsonl github_search.jsonl
# Candidate binary: the debug build produced by `cargo build` on the host.
COPY target/debug/ruff ruff-new
# Compare the candidate (ruff-new) against the baseline (.venv/bin/ruff) on the scraped projects.
CMD ["python", "check_ecosystem.py", "--verbose", "--projects", "github_search.jsonl", "ruff-new", ".venv/bin/ruff"]

View file

@ -6,29 +6,31 @@ Example usage:
scripts/check_ecosystem.py <path/to/ruff1> <path/to/ruff2>
"""
# ruff: noqa: T201
import argparse
import asyncio
import difflib
import heapq
import json
import logging
import re
import tempfile
from asyncio.subprocess import PIPE, create_subprocess_exec
from contextlib import asynccontextmanager
from pathlib import Path
from typing import TYPE_CHECKING, NamedTuple, Self
from typing import TYPE_CHECKING, NamedTuple, Optional, Self
if TYPE_CHECKING:
from collections.abc import AsyncIterator, Iterator, Sequence
logger = logging.getLogger(__name__)
class Repository(NamedTuple):
"""A GitHub repository at a specific ref."""
org: str
repo: str
ref: str
ref: Optional[str]
select: str = ""
ignore: str = ""
exclude: str = ""
@ -37,7 +39,8 @@ class Repository(NamedTuple):
async def clone(self: Self) -> "AsyncIterator[Path]":
"""Shallow clone this repository to a temporary directory."""
with tempfile.TemporaryDirectory() as tmpdir:
process = await create_subprocess_exec(
logger.debug(f"Cloning {self.org}/{self.repo}")
git_command = [
"git",
"clone",
"--config",
@ -46,14 +49,23 @@ class Repository(NamedTuple):
"--depth",
"1",
"--no-tags",
"--branch",
self.ref,
f"https://github.com/{self.org}/{self.repo}",
tmpdir,
]
if self.ref:
git_command.extend(["--branch", self.ref])
git_command.extend(
[
f"https://github.com/{self.org}/{self.repo}",
tmpdir,
],
)
process = await create_subprocess_exec(*git_command)
await process.wait()
logger.debug(f"Finished cloning {self.org}/{self.repo}")
yield Path(tmpdir)
@ -80,11 +92,13 @@ async def check(
*,
ruff: Path,
path: Path,
name: str,
select: str = "",
ignore: str = "",
exclude: str = "",
) -> "Sequence[str]":
"""Run the given ruff binary against the specified path."""
logger.debug(f"Checking {name} with {ruff}")
ruff_args = ["check", "--no-cache", "--exit-zero"]
if select:
ruff_args.extend(["--select", select])
@ -103,6 +117,8 @@ async def check(
result, err = await proc.communicate()
logger.debug(f"Finished checking {name} with {ruff}")
if proc.returncode != 0:
raise RuffError(err.decode("utf8"))
@ -145,6 +161,7 @@ async def compare(ruff1: Path, ruff2: Path, repo: Repository) -> Diff | None:
check(
ruff=ruff1,
path=path,
name=f"{repo.org}/{repo.repo}",
select=repo.select,
ignore=repo.ignore,
exclude=repo.exclude,
@ -154,6 +171,7 @@ async def compare(ruff1: Path, ruff2: Path, repo: Repository) -> Diff | None:
check(
ruff=ruff2,
path=path,
name=f"{repo.org}/{repo.repo}",
select=repo.select,
ignore=repo.ignore,
exclude=repo.exclude,
@ -171,14 +189,58 @@ async def compare(ruff1: Path, ruff2: Path, repo: Repository) -> Diff | None:
return Diff(removed, added)
async def main(*, ruff1: Path, ruff2: Path) -> None:
def read_projects_jsonl(projects_jsonl: Path) -> dict[str, Repository]:
    """Read either of the two formats of https://github.com/akx/ruff-usage-aggregate.

    Every non-blank line of ``projects_jsonl`` is parsed as one JSON document in
    one of two shapes:

    * an object with an ``"items"`` list, where each item has a ``"path"`` and a
      ``"repository"`` object (``"name"`` plus ``"owner"["login"]``); these
      entries carry no usable ref, so ``Repository.ref`` is ``None``;
    * an object with ``"owner"``, ``"repo"``, ``"path"`` and an optional
      ``"ref"``.

    Only entries whose path is exactly ``pyproject.toml`` are kept.

    Returns:
        A mapping from the bare repository name to its ``Repository``. Entries
        sharing a name overwrite each other, the last one read wins.

    Raises:
        ValueError: If a line matches neither format, or an ``"items"`` entry
            has a repository name with characters outside ``[a-zA-Z0-9_.-]``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The name ends up in a clone URL, so only accept a conservative character
    # set. Explicit raises (not `assert`) so the check survives `python -O`.
    valid_name = re.compile(r"[a-zA-Z0-9_.-]+")

    repositories = {}
    for line in projects_jsonl.read_text(encoding="utf8").splitlines():
        # Tolerate blank/whitespace-only lines instead of failing in json.loads.
        if not line.strip():
            continue
        data = json.loads(line)
        # Check the input format.
        if "items" in data:
            for item in data["items"]:
                # Pick only the easier case for now.
                if item["path"] != "pyproject.toml":
                    continue
                repository = item["repository"]
                name = repository["name"]
                if not valid_name.fullmatch(name):
                    msg = f"Unexpected repository name: {name!r}"
                    raise ValueError(msg)
                # The search results contain no branch name, only the revision
                # embedded in the item URL (`item["url"].split("?ref=")[1]`).
                # That revision would be exact, but `git clone --depth 1` cannot
                # check out an arbitrary revision, so clone the default branch.
                repositories[name] = Repository(
                    repository["owner"]["login"],
                    name,
                    None,
                )
        else:
            if "owner" not in data:
                msg = "Unknown ruff-usage-aggregate format"
                raise ValueError(msg)
            # Pick only the easier case for now.
            if data["path"] != "pyproject.toml":
                continue
            repositories[data["repo"]] = Repository(
                data["owner"],
                data["repo"],
                data.get("ref"),
            )
    return repositories
async def main(*, ruff1: Path, ruff2: Path, projects_jsonl: Optional[Path]) -> None:
"""Check two versions of ruff against a corpus of open-source code."""
if projects_jsonl:
repositories = read_projects_jsonl(projects_jsonl)
else:
repositories = REPOSITORIES
logger.debug(f"Checking {len(repositories)} projects")
results = await asyncio.gather(
*[compare(ruff1, ruff2, repo) for repo in REPOSITORIES.values()],
*[compare(ruff1, ruff2, repo) for repo in repositories.values()],
return_exceptions=True,
)
diffs = dict(zip(REPOSITORIES, results, strict=True))
diffs = dict(zip(repositories, results, strict=True))
total_removed = total_added = 0
errors = 0
@ -202,6 +264,11 @@ async def main(*, ruff1: Path, ruff2: Path) -> None:
if isinstance(diff, Exception):
changes = "error"
print(f"<details><summary>{name} ({changes})</summary>")
repo = repositories[name]
print(
f"https://github.com/{repo.org}/{repo.repo} ref {repo.ref} "
f"select {repo.select} ignore {repo.ignore} exclude {repo.exclude}",
)
print("<p>")
print()
@ -230,6 +297,8 @@ async def main(*, ruff1: Path, ruff2: Path) -> None:
else:
continue
logger.debug(f"Finished {len(repositories)} repositories")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
@ -237,6 +306,20 @@ if __name__ == "__main__":
epilog="scripts/check_ecosystem.py <path/to/ruff1> <path/to/ruff2>",
)
parser.add_argument(
"--projects",
type=Path,
help=(
"Optional JSON files to use over the default repositories. "
"Supports both github_search_*.jsonl and known-github-tomls.jsonl."
),
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
help="Activate debug logging",
)
parser.add_argument(
"ruff1",
type=Path,
@ -248,4 +331,9 @@ if __name__ == "__main__":
args = parser.parse_args()
asyncio.run(main(ruff1=args.ruff1, ruff2=args.ruff2))
if args.verbose:
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
asyncio.run(main(ruff1=args.ruff1, ruff2=args.ruff2, projects_jsonl=args.projects))

View file

@ -14,6 +14,8 @@ ignore = [
"C901", # McCabe complexity
"PL", # pylint
"S", # bandit
"G", # flake8-logging
"T", # flake8-print
]
[tool.ruff.pydocstyle]