mirror of
https://github.com/astral-sh/ruff.git
synced 2025-08-03 18:28:24 +00:00
Extended ecosystem check with scraped data (#3858)
This commit is contained in:
parent
cae5503e34
commit
454c6d9c2f
5 changed files with 140 additions and 13 deletions
22
scripts/Dockerfile.ecosystem
Normal file
22
scripts/Dockerfile.ecosystem
Normal file
|
@ -0,0 +1,22 @@
|
|||
# [crater](https://github.com/rust-lang/crater)-inspired check that tests against a large number of
# projects, mainly from https://github.com/akx/ruff-usage-aggregate.
#
# We run this in a Docker container as Ruff isn't designed for untrusted inputs.
#
# Either download https://github.com/akx/ruff-usage-aggregate/blob/master/data/known-github-tomls.jsonl as
# `github_search.jsonl` or follow the instructions in the README to scrape your own dataset.
#
# From the project root:
# ```
# cargo build
# docker buildx build -f scripts/Dockerfile.ecosystem -t ruff-ecosystem-checker --load .
# docker run --rm ruff-ecosystem-checker
# ```

FROM python:3.11

# Baseline binary: whatever ruff version pip resolves from PyPI, isolated in a virtualenv.
RUN python -m venv .venv && .venv/bin/pip install ruff

# COPY (not ADD): these are plain local files, so ADD's URL fetching and archive
# auto-extraction are neither needed nor wanted.
COPY scripts/check_ecosystem.py check_ecosystem.py
COPY github_search.jsonl github_search.jsonl

# Candidate binary: the locally built debug build produced by `cargo build`.
COPY target/debug/ruff ruff-new

# Positional arguments are <ruff1> <ruff2>: the local build first, the PyPI release second.
CMD ["python", "check_ecosystem.py", "--verbose", "--projects", "github_search.jsonl", "ruff-new", ".venv/bin/ruff"]
|
|
@ -6,29 +6,31 @@ Example usage:
|
|||
scripts/check_ecosystem.py <path/to/ruff1> <path/to/ruff2>
|
||||
"""
|
||||
|
||||
# ruff: noqa: T201
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import difflib
|
||||
import heapq
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import tempfile
|
||||
from asyncio.subprocess import PIPE, create_subprocess_exec
|
||||
from contextlib import asynccontextmanager
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, NamedTuple, Self
|
||||
from typing import TYPE_CHECKING, NamedTuple, Optional, Self
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import AsyncIterator, Iterator, Sequence
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Repository(NamedTuple):
|
||||
"""A GitHub repository at a specific ref."""
|
||||
|
||||
org: str
|
||||
repo: str
|
||||
ref: str
|
||||
ref: Optional[str]
|
||||
select: str = ""
|
||||
ignore: str = ""
|
||||
exclude: str = ""
|
||||
|
@ -37,7 +39,8 @@ class Repository(NamedTuple):
|
|||
async def clone(self: Self) -> "AsyncIterator[Path]":
|
||||
"""Shallow clone this repository to a temporary directory."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
process = await create_subprocess_exec(
|
||||
logger.debug(f"Cloning {self.org}/{self.repo}")
|
||||
git_command = [
|
||||
"git",
|
||||
"clone",
|
||||
"--config",
|
||||
|
@ -46,14 +49,23 @@ class Repository(NamedTuple):
|
|||
"--depth",
|
||||
"1",
|
||||
"--no-tags",
|
||||
"--branch",
|
||||
self.ref,
|
||||
f"https://github.com/{self.org}/{self.repo}",
|
||||
tmpdir,
|
||||
]
|
||||
if self.ref:
|
||||
git_command.extend(["--branch", self.ref])
|
||||
|
||||
git_command.extend(
|
||||
[
|
||||
f"https://github.com/{self.org}/{self.repo}",
|
||||
tmpdir,
|
||||
],
|
||||
)
|
||||
|
||||
process = await create_subprocess_exec(*git_command)
|
||||
|
||||
await process.wait()
|
||||
|
||||
logger.debug(f"Finished cloning {self.org}/{self.repo}")
|
||||
|
||||
yield Path(tmpdir)
|
||||
|
||||
|
||||
|
@ -80,11 +92,13 @@ async def check(
|
|||
*,
|
||||
ruff: Path,
|
||||
path: Path,
|
||||
name: str,
|
||||
select: str = "",
|
||||
ignore: str = "",
|
||||
exclude: str = "",
|
||||
) -> "Sequence[str]":
|
||||
"""Run the given ruff binary against the specified path."""
|
||||
logger.debug(f"Checking {name} with {ruff}")
|
||||
ruff_args = ["check", "--no-cache", "--exit-zero"]
|
||||
if select:
|
||||
ruff_args.extend(["--select", select])
|
||||
|
@ -103,6 +117,8 @@ async def check(
|
|||
|
||||
result, err = await proc.communicate()
|
||||
|
||||
logger.debug(f"Finished checking {name} with {ruff}")
|
||||
|
||||
if proc.returncode != 0:
|
||||
raise RuffError(err.decode("utf8"))
|
||||
|
||||
|
@ -145,6 +161,7 @@ async def compare(ruff1: Path, ruff2: Path, repo: Repository) -> Diff | None:
|
|||
check(
|
||||
ruff=ruff1,
|
||||
path=path,
|
||||
name=f"{repo.org}/{repo.repo}",
|
||||
select=repo.select,
|
||||
ignore=repo.ignore,
|
||||
exclude=repo.exclude,
|
||||
|
@ -154,6 +171,7 @@ async def compare(ruff1: Path, ruff2: Path, repo: Repository) -> Diff | None:
|
|||
check(
|
||||
ruff=ruff2,
|
||||
path=path,
|
||||
name=f"{repo.org}/{repo.repo}",
|
||||
select=repo.select,
|
||||
ignore=repo.ignore,
|
||||
exclude=repo.exclude,
|
||||
|
@ -171,14 +189,58 @@ async def compare(ruff1: Path, ruff2: Path, repo: Repository) -> Diff | None:
|
|||
return Diff(removed, added)
|
||||
|
||||
|
||||
async def main(*, ruff1: Path, ruff2: Path) -> None:
|
||||
def read_projects_jsonl(projects_jsonl: Path) -> dict[str, Repository]:
    """Read either of the two formats of https://github.com/akx/ruff-usage-aggregate.

    Every non-blank line of ``projects_jsonl`` is parsed as one JSON object. Two
    shapes are understood:

    * An object with an ``"items"`` list, where each item has a ``"path"`` and a
      ``"repository"`` object providing ``"name"`` and ``"owner"["login"]``. No
      ref is available in this shape, so the ``Repository`` ref is ``None``.
    * A flat object with ``"owner"``, ``"repo"``, ``"path"`` and an optional
      ``"ref"``.

    Only entries whose path is exactly ``pyproject.toml`` are kept.

    Returns:
        A dict mapping the bare repository name to its ``Repository``. Because
        the key is the repository name alone, a later entry replaces an earlier
        one with the same name, even when the owners differ.

    Raises:
        ValueError: If a line is in neither known format, or if an owner or
            repository name contains characters outside ``[a-zA-Z0-9_.-]``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    github_name = re.compile(r"[a-zA-Z0-9_.-]+")

    def checked(value: object, what: str) -> str:
        """Return ``value`` unchanged if it is a plain GitHub-style name."""
        # Explicit raise instead of `assert`: these values end up in a clone
        # URL, and asserts are stripped when Python runs with `-O`.
        if not isinstance(value, str) or not github_name.fullmatch(value):
            msg = f"Invalid {what} in {projects_jsonl}: {value!r}"
            raise ValueError(msg)
        return value

    repositories = {}
    for line in projects_jsonl.read_text(encoding="utf-8").splitlines():
        # Tolerate blank lines; `json.loads("")` would raise.
        if not line.strip():
            continue
        data = json.loads(line)
        # Check the input format.
        if "items" in data:
            for item in data["items"]:
                # Pick only the easier case for now.
                if item["path"] != "pyproject.toml":
                    continue
                repository = item["repository"]
                name = checked(repository["name"], "repository name")
                owner = checked(repository["owner"]["login"], "repository owner")
                # GitHub doesn't give us any branch or pure rev info. This would give
                # us the revision, but there's no way with git to just do
                # `git clone --depth 1` with a specific ref.
                # `ref = item["url"].split("?ref=")[1]` would be exact
                repositories[name] = Repository(owner, name, None)
        else:
            if "owner" not in data:
                msg = "Unknown ruff-usage-aggregate format"
                raise ValueError(msg)
            # Pick only the easier case for now.
            if data["path"] != "pyproject.toml":
                continue
            name = checked(data["repo"], "repository name")
            owner = checked(data["owner"], "repository owner")
            repositories[name] = Repository(owner, name, data.get("ref"))
    return repositories
|
||||
|
||||
|
||||
async def main(*, ruff1: Path, ruff2: Path, projects_jsonl: Optional[Path]) -> None:
|
||||
"""Check two versions of ruff against a corpus of open-source code."""
|
||||
if projects_jsonl:
|
||||
repositories = read_projects_jsonl(projects_jsonl)
|
||||
else:
|
||||
repositories = REPOSITORIES
|
||||
|
||||
logger.debug(f"Checking {len(repositories)} projects")
|
||||
|
||||
results = await asyncio.gather(
|
||||
*[compare(ruff1, ruff2, repo) for repo in REPOSITORIES.values()],
|
||||
*[compare(ruff1, ruff2, repo) for repo in repositories.values()],
|
||||
return_exceptions=True,
|
||||
)
|
||||
|
||||
diffs = dict(zip(REPOSITORIES, results, strict=True))
|
||||
diffs = dict(zip(repositories, results, strict=True))
|
||||
|
||||
total_removed = total_added = 0
|
||||
errors = 0
|
||||
|
@ -202,6 +264,11 @@ async def main(*, ruff1: Path, ruff2: Path) -> None:
|
|||
if isinstance(diff, Exception):
|
||||
changes = "error"
|
||||
print(f"<details><summary>{name} ({changes})</summary>")
|
||||
repo = repositories[name]
|
||||
print(
|
||||
f"https://github.com/{repo.org}/{repo.repo} ref {repo.ref} "
|
||||
f"select {repo.select} ignore {repo.ignore} exclude {repo.exclude}",
|
||||
)
|
||||
print("<p>")
|
||||
print()
|
||||
|
||||
|
@ -230,6 +297,8 @@ async def main(*, ruff1: Path, ruff2: Path) -> None:
|
|||
else:
|
||||
continue
|
||||
|
||||
logger.debug(f"Finished {len(repositories)} repositories")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
|
@ -237,6 +306,20 @@ if __name__ == "__main__":
|
|||
epilog="scripts/check_ecosystem.py <path/to/ruff1> <path/to/ruff2>",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--projects",
|
||||
type=Path,
|
||||
help=(
|
||||
"Optional JSON files to use over the default repositories. "
|
||||
"Supports both github_search_*.jsonl and known-github-tomls.jsonl."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v",
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
help="Activate debug logging",
|
||||
)
|
||||
parser.add_argument(
|
||||
"ruff1",
|
||||
type=Path,
|
||||
|
@ -248,4 +331,9 @@ if __name__ == "__main__":
|
|||
|
||||
args = parser.parse_args()
|
||||
|
||||
asyncio.run(main(ruff1=args.ruff1, ruff2=args.ruff2))
|
||||
if args.verbose:
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
else:
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
asyncio.run(main(ruff1=args.ruff1, ruff2=args.ruff2, projects_jsonl=args.projects))
|
||||
|
|
|
@ -14,6 +14,8 @@ ignore = [
|
|||
"C901", # McCabe complexity
|
||||
"PL", # pylint
|
||||
"S", # bandit
|
||||
"G", # flake8-logging
|
||||
"T", # flake8-print
|
||||
]
|
||||
|
||||
[tool.ruff.pydocstyle]
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue