Ecosystem CI: Allow storing checkouts locally (#4192)

* Ecosystem CI: Allow storing checkouts locally

This adds a --checkouts option to (re)use a local directory instead of checking out into a tempdir

* Fix missing path conversion
This commit is contained in:
konstin 2023-05-11 17:36:44 +02:00 committed by GitHub
parent 3c2f41b615
commit 6a52577630
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 114 additions and 63 deletions

2
.gitignore vendored
View file

@ -3,8 +3,8 @@
crates/ruff/resources/test/cpython
mkdocs.yml
.overrides
github_search.jsonl
ruff-old
github_search*.jsonl
###
# Rust.gitignore

View file

@ -17,6 +17,12 @@
# docker buildx build -f scripts/Dockerfile.ecosystem -t ruff-ecosystem-checker --load .
# docker run --rm -v ./target/x86_64-unknown-linux-musl/debug/ruff:/app/ruff-new -v ./ruff-old:/app/ruff-old ruff-ecosystem-checker
# ```
# You can customize this, e.g. cache the git checkouts and use a custom json file:
# ```
# docker run -v ./target/x86_64-unknown-linux-musl/debug/ruff:/app/ruff-new -v ./ruff-old:/app/ruff-old \
# -v ./target/checkouts:/app/checkouts -v ./github_search.jsonl:/app/github_search.jsonl \
# --rm ruff-ecosystem-checker python check_ecosystem.py -v ruff-new ruff-old --checkouts checkouts > output.txt
# ```
FROM python:3.11
RUN mkdir /app

View file

@ -14,8 +14,9 @@ import json
import logging
import re
import tempfile
import time
from asyncio.subprocess import PIPE, create_subprocess_exec
from contextlib import asynccontextmanager
from contextlib import asynccontextmanager, nullcontext
from pathlib import Path
from typing import TYPE_CHECKING, NamedTuple, Optional, Self
@ -36,9 +37,13 @@ class Repository(NamedTuple):
exclude: str = ""
@asynccontextmanager
async def clone(self: Self) -> "AsyncIterator[Path]":
async def clone(self: Self, checkout_dir: Path) -> "AsyncIterator[Path]":
"""Shallow clone this repository to a temporary directory."""
with tempfile.TemporaryDirectory() as tmpdir:
if checkout_dir.exists():
logger.debug(f"Reusing {self.org}/{self.repo}")
yield Path(checkout_dir)
return
logger.debug(f"Cloning {self.org}/{self.repo}")
git_command = [
"git",
@ -56,7 +61,7 @@ class Repository(NamedTuple):
git_command.extend(
[
f"https://github.com/{self.org}/{self.repo}",
tmpdir,
checkout_dir,
],
)
@ -66,7 +71,7 @@ class Repository(NamedTuple):
logger.debug(f"Finished cloning {self.org}/{self.repo}")
yield Path(tmpdir)
yield Path(checkout_dir)
REPOSITORIES = {
@ -106,6 +111,8 @@ async def check(
ruff_args.extend(["--ignore", ignore])
if exclude:
ruff_args.extend(["--exclude", exclude])
start = time.time()
proc = await create_subprocess_exec(
ruff.absolute(),
*ruff_args,
@ -114,10 +121,10 @@ async def check(
stderr=PIPE,
cwd=path,
)
result, err = await proc.communicate()
end = time.time()
logger.debug(f"Finished checking {name} with {ruff}")
logger.debug(f"Finished checking {name} with {ruff} in {end - start:.2f}")
if proc.returncode != 0:
raise RuffError(err.decode("utf8"))
@ -150,11 +157,28 @@ class Diff(NamedTuple):
yield f"+ {line}"
async def compare(ruff1: Path, ruff2: Path, repo: Repository) -> Diff | None:
async def compare(
ruff1: Path,
ruff2: Path,
repo: Repository,
checkouts: Optional[Path] = None,
) -> Diff | None:
"""Check a specific repository against two versions of ruff."""
removed, added = set(), set()
async with repo.clone() as path:
# Allows keeping the checkouts in a persistent location
if checkouts:
checkout_dir = checkouts.joinpath(repo.org).joinpath(repo.repo)
# Don't create the repo dir itself; its existence is how we detect existing
# clones
checkout_dir.parent.mkdir(exist_ok=True, parents=True)
location_context = nullcontext(checkout_dir)
else:
location_context = tempfile.TemporaryDirectory()
with location_context as checkout_dir:
checkout_dir = Path(checkout_dir)
async with repo.clone(checkout_dir) as path:
try:
async with asyncio.TaskGroup() as tg:
check1 = tg.create_task(
@ -226,7 +250,13 @@ def read_projects_jsonl(projects_jsonl: Path) -> dict[str, Repository]:
return repositories
async def main(*, ruff1: Path, ruff2: Path, projects_jsonl: Optional[Path]) -> None:
async def main(
*,
ruff1: Path,
ruff2: Path,
projects_jsonl: Optional[Path],
checkouts: Optional[Path] = None,
) -> None:
"""Check two versions of ruff against a corpus of open-source code."""
if projects_jsonl:
repositories = read_projects_jsonl(projects_jsonl)
@ -236,7 +266,7 @@ async def main(*, ruff1: Path, ruff2: Path, projects_jsonl: Optional[Path]) -> N
logger.debug(f"Checking {len(repositories)} projects")
results = await asyncio.gather(
*[compare(ruff1, ruff2, repo) for repo in repositories.values()],
*[compare(ruff1, ruff2, repo, checkouts) for repo in repositories.values()],
return_exceptions=True,
)
@ -353,6 +383,14 @@ if __name__ == "__main__":
"Supports both github_search_*.jsonl and known-github-tomls.jsonl."
),
)
parser.add_argument(
"--checkouts",
type=Path,
help=(
"Location for the git checkouts, in case you want to save them"
" (defaults to temporary directory)"
),
)
parser.add_argument(
"-v",
"--verbose",
@ -375,4 +413,11 @@ if __name__ == "__main__":
else:
logging.basicConfig(level=logging.INFO)
asyncio.run(main(ruff1=args.ruff1, ruff2=args.ruff2, projects_jsonl=args.projects))
asyncio.run(
main(
ruff1=args.ruff1,
ruff2=args.ruff2,
projects_jsonl=args.projects,
checkouts=args.checkouts,
),
)