#!/usr/bin/env python3
"""
**DEPRECATED** This script is being replaced by the ruff-ecosystem package.


Check two versions of ruff against a corpus of open-source code.

Example usage:

    scripts/check_ecosystem.py <path/to/ruff1> <path/to/ruff2>
"""

from __future__ import annotations

import argparse
import asyncio
import difflib
import heapq
import json
import logging
import re
import tempfile
import time
from asyncio.subprocess import PIPE, create_subprocess_exec
from contextlib import asynccontextmanager, nullcontext
from pathlib import Path
from signal import SIGINT, SIGTERM
from typing import TYPE_CHECKING, NamedTuple, Self, TypeVar

if TYPE_CHECKING:
    from collections.abc import AsyncIterator, Awaitable, Iterator, Sequence

logger = logging.getLogger(__name__)


class Repository(NamedTuple):
    """A GitHub repository at a specific ref."""

    org: str
    repo: str
    ref: str | None
    select: str = ""
    ignore: str = ""
    exclude: str = ""
    # Generating fixes is slow and verbose
    show_fixes: bool = False

    @asynccontextmanager
    async def clone(self: Self, checkout_dir: Path) -> AsyncIterator[str]:
|         """Shallow clone this repository to a temporary directory."""
 | |
|         if checkout_dir.exists():
 | |
|             logger.debug(f"Reusing {self.org}:{self.repo}")
 | |
|             yield await self._get_commit(checkout_dir)
 | |
|             return
 | |
| 
 | |
|         logger.debug(f"Cloning {self.org}:{self.repo}")
 | |
|         git_clone_command = [
 | |
|             "git",
 | |
|             "clone",
 | |
|             "--config",
 | |
|             "advice.detachedHead=false",
 | |
|             "--quiet",
 | |
|             "--depth",
 | |
|             "1",
 | |
|             "--no-tags",
 | |
|         ]
 | |
|         if self.ref:
 | |
|             git_clone_command.extend(["--branch", self.ref])
 | |
| 
 | |
|         git_clone_command.extend(
 | |
|             [
 | |
|                 f"https://github.com/{self.org}/{self.repo}",
 | |
|                 checkout_dir,
 | |
|             ],
 | |
|         )
 | |
| 
 | |
|         git_clone_process = await create_subprocess_exec(
 | |
|             *git_clone_command,
 | |
|             env={"GIT_TERMINAL_PROMPT": "0"},
 | |
|         )
 | |
| 
 | |
|         status_code = await git_clone_process.wait()
 | |
| 
 | |
|         logger.debug(
 | |
|             f"Finished cloning {self.org}/{self.repo} with status {status_code}",
 | |
|         )
 | |
|         yield await self._get_commit(checkout_dir)
 | |
| 
 | |
|     def url_for(self: Self, commit_sha: str, path: str, lnum: int | None = None) -> str:
 | |
|         """
 | |
|         Return the GitHub URL for the given commit, path, and line number, if given.
 | |
|         """
 | |
        # Link to the file at the given commit SHA
|         url = f"https://github.com/{self.org}/{self.repo}/blob/{commit_sha}/{path}"
 | |
|         if lnum:
 | |
|             url += f"#L{lnum}"
 | |
|         return url
 | |
| 
 | |
|     async def _get_commit(self: Self, checkout_dir: Path) -> str:
 | |
|         """Return the commit sha for the repository in the checkout directory."""
 | |
|         git_sha_process = await create_subprocess_exec(
 | |
|             *["git", "rev-parse", "HEAD"],
 | |
|             cwd=checkout_dir,
 | |
|             stdout=PIPE,
 | |
|         )
 | |
|         git_sha_stdout, _ = await git_sha_process.communicate()
 | |
|         assert await git_sha_process.wait() == 0, (
 | |
|             f"Failed to retrieve commit sha at {checkout_dir}"
 | |
|         )
 | |
|         return git_sha_stdout.decode().strip()
 | |
| 
 | |
| 
 | |
| # Repositories to check
 | |
| # We check most repositories with the default ruleset instead of all rules to avoid
 | |
| # noisy reports when new rules are added; see https://github.com/astral-sh/ruff/pull/3590
 | |
| REPOSITORIES: list[Repository] = [
 | |
|     Repository("DisnakeDev", "disnake", "master"),
 | |
|     Repository("PostHog", "HouseWatch", "main"),
 | |
|     Repository("RasaHQ", "rasa", "main"),
 | |
|     Repository("Snowflake-Labs", "snowcli", "main"),
 | |
|     Repository("aiven", "aiven-client", "main"),
 | |
|     Repository("alteryx", "featuretools", "main"),
 | |
|     Repository("apache", "airflow", "main", select="ALL"),
 | |
|     Repository("apache", "superset", "master", select="ALL"),
 | |
|     Repository("aws", "aws-sam-cli", "develop"),
 | |
|     Repository("binary-husky", "gpt_academic", "master"),
 | |
|     Repository("bloomberg", "pytest-memray", "main"),
 | |
|     Repository("bokeh", "bokeh", "branch-3.3", select="ALL"),
 | |
|     # Disabled due to use of explicit `select` with `E999`, which has been removed.
 | |
|     # See: https://github.com/astral-sh/ruff/pull/12129
 | |
|     # Repository("demisto", "content", "master"),
 | |
|     Repository("docker", "docker-py", "main"),
 | |
|     Repository("facebookresearch", "chameleon", "main"),
 | |
|     Repository("freedomofpress", "securedrop", "develop"),
 | |
|     Repository("fronzbot", "blinkpy", "dev"),
 | |
|     Repository("ibis-project", "ibis", "master"),
 | |
|     Repository("ing-bank", "probatus", "main"),
 | |
|     Repository("jrnl-org", "jrnl", "develop"),
 | |
|     Repository("langchain-ai", "langchain", "main"),
 | |
|     Repository("latchbio", "latch", "main"),
 | |
|     Repository("lnbits", "lnbits", "main"),
 | |
|     Repository("milvus-io", "pymilvus", "master"),
 | |
|     Repository("mlflow", "mlflow", "master"),
 | |
|     Repository("model-bakers", "model_bakery", "main"),
 | |
|     Repository("pandas-dev", "pandas", "main"),
 | |
|     Repository("prefecthq", "prefect", "main"),
 | |
|     Repository("pypa", "build", "main"),
 | |
|     Repository("pypa", "cibuildwheel", "main"),
 | |
|     Repository("pypa", "pip", "main"),
 | |
|     Repository("pypa", "setuptools", "main"),
 | |
|     Repository("python", "mypy", "master"),
 | |
|     Repository("python", "typeshed", "main", select="PYI"),
 | |
|     Repository("python-poetry", "poetry", "master"),
 | |
|     Repository("qdrant", "qdrant-client", "master"),
 | |
|     Repository("reflex-dev", "reflex", "main"),
 | |
|     Repository("rotki", "rotki", "develop"),
 | |
|     Repository("scikit-build", "scikit-build", "main"),
 | |
|     Repository("scikit-build", "scikit-build-core", "main"),
 | |
|     Repository("sphinx-doc", "sphinx", "master"),
 | |
|     Repository("spruceid", "siwe-py", "main"),
 | |
|     Repository("tiangolo", "fastapi", "master"),
 | |
|     Repository("yandex", "ch-backup", "main"),
 | |
|     Repository("zulip", "zulip", "main", select="ALL"),
 | |
| ]
 | |
| 
 | |
| SUMMARY_LINE_RE = re.compile(r"^(Found \d+ error.*)|(.*potentially fixable with.*)$")
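# For reference, the summary lines filtered out above look roughly like
# "Found 42 errors." or "[*] 7 potentially fixable with the `--fix` option."
# (illustrative examples; the exact wording depends on the ruff version).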


class RuffError(Exception):
    """An error reported by ruff."""


async def check(
    *,
    ruff: Path,
    path: Path,
    name: str,
    select: str = "",
    ignore: str = "",
    exclude: str = "",
    show_fixes: bool = False,
) -> Sequence[str]:
    """Run the given ruff binary against the specified path."""
    logger.debug(f"Checking {name} with {ruff}")
    ruff_args = ["check", "--no-cache", "--exit-zero"]
    if select:
        ruff_args.extend(["--select", select])
    if ignore:
        ruff_args.extend(["--ignore", ignore])
    if exclude:
        ruff_args.extend(["--exclude", exclude])
    if show_fixes:
        ruff_args.extend(["--show-fixes"])
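    # For a hypothetical repository configured with select="ALL" and
    # show_fixes=True, the assembled invocation would be roughly:
    #     <ruff> check --no-cache --exit-zero --select ALL --show-fixes .
    # run from inside the checkout directory (cwd=path below).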

    start = time.time()
    proc = await create_subprocess_exec(
        ruff.absolute(),
        *ruff_args,
        ".",
        stdout=PIPE,
        stderr=PIPE,
        cwd=path,
    )
    result, err = await proc.communicate()
    end = time.time()

    logger.debug(f"Finished checking {name} with {ruff} in {end - start:.2f}")

    if proc.returncode != 0:
        raise RuffError(err.decode("utf8"))

    lines = [
        line
        for line in result.decode("utf8").splitlines()
        if not SUMMARY_LINE_RE.match(line)
    ]

    return sorted(lines)


class Diff(NamedTuple):
    """A diff between two runs of ruff."""

    removed: set[str]
    added: set[str]
    source_sha: str

    def __bool__(self: Self) -> bool:
        """Return true if this diff is non-empty."""
        return bool(self.removed or self.added)

    def __iter__(self: Self) -> Iterator[str]:
        """Iterate through the changed lines in diff format."""
        for line in heapq.merge(sorted(self.removed), sorted(self.added)):
            if line in self.removed:
                yield f"- {line}"
            else:
                yield f"+ {line}"


async def compare(
    ruff1: Path,
    ruff2: Path,
    repo: Repository,
    checkouts: Path | None = None,
) -> Diff | None:
    """Check a specific repository against two versions of ruff."""
    removed, added = set(), set()

    # By default, the git clones are transient, but if the user provides a
    # directory for permanent storage, we keep them there.
    if checkouts:
        location_context = nullcontext(checkouts)
    else:
        location_context = tempfile.TemporaryDirectory()

    with location_context as checkout_parent:
        assert ":" not in repo.org
        assert ":" not in repo.repo
        checkout_dir = Path(checkout_parent).joinpath(f"{repo.org}:{repo.repo}")
        async with repo.clone(checkout_dir) as checkout_sha:
            try:
                async with asyncio.TaskGroup() as tg:
                    check1 = tg.create_task(
                        check(
                            ruff=ruff1,
                            path=checkout_dir,
                            name=f"{repo.org}/{repo.repo}",
                            select=repo.select,
                            ignore=repo.ignore,
                            exclude=repo.exclude,
                            show_fixes=repo.show_fixes,
                        ),
                    )
                    check2 = tg.create_task(
                        check(
                            ruff=ruff2,
                            path=checkout_dir,
                            name=f"{repo.org}/{repo.repo}",
                            select=repo.select,
                            ignore=repo.ignore,
                            exclude=repo.exclude,
                            show_fixes=repo.show_fixes,
                        ),
                    )
            except ExceptionGroup as e:
                raise e.exceptions[0] from e

            for line in difflib.ndiff(check1.result(), check2.result()):
                if line.startswith("- "):
                    removed.add(line[2:])
                elif line.startswith("+ "):
                    added.add(line[2:])

    return Diff(removed, added, checkout_sha)


def read_projects_jsonl(projects_jsonl: Path) -> dict[tuple[str, str], Repository]:
    """Read either of the two formats of https://github.com/akx/ruff-usage-aggregate."""
    repositories = {}
    for line in projects_jsonl.read_text().splitlines():
        data = json.loads(line)
        # Check the input format.
        if "items" in data:
            for item in data["items"]:
                # Pick only the easier case for now.
                if item["path"] != "pyproject.toml":
                    continue
                repository = item["repository"]
                assert re.fullmatch(r"[a-zA-Z0-9_.-]+", repository["name"]), repository[
                    "name"
                ]
                # GitHub doesn't give us any branch or pure rev info.  This would give
                # us the revision, but there's no way with git to just do
                # `git clone --depth 1` with a specific ref.
                # `ref = item["url"].split("?ref=")[1]` would be exact
                key = (repository["owner"]["login"], repository["name"])
                repositories[key] = Repository(
|                     repository["owner"]["login"],
 | |
|                     repository["name"],
 | |
|                     None,
 | |
|                     select=repository.get("select"),
 | |
|                     ignore=repository.get("ignore"),
 | |
|                     exclude=repository.get("exclude"),
 | |
|                 )
 | |
|         else:
 | |
|             assert "owner" in data, "Unknown ruff-usage-aggregate format"
 | |
|             # Pick only the easier case for now.
 | |
|             if data["path"] != "pyproject.toml":
 | |
|                 continue
 | |
|             repositories[(data["owner"], data["repo"])] = Repository(
 | |
|                 data["owner"],
 | |
|                 data["repo"],
 | |
|                 data.get("ref"),
 | |
|                 select=data.get("select"),
 | |
|                 ignore=data.get("ignore"),
 | |
|                 exclude=data.get("exclude"),
 | |
|             )
 | |
|     return repositories
 | |
| 
 | |
| 
 | |
| DIFF_LINE_RE = re.compile(
 | |
|     r"^(?P<pre>[+-]) (?P<inner>(?P<path>[^:]+):(?P<lnum>\d+):\d+:) (?P<post>.*)$",
 | |
| )
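# A hypothetical line in the format this pattern is meant to parse:
#   + path/to/file.py:10:5: F401 `os` imported but unused
# yielding pre="+", path="path/to/file.py", lnum="10",
# inner="path/to/file.py:10:5:", and post holding the rest of the message.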

T = TypeVar("T")


async def main(
    *,
    ruff1: Path,
    ruff2: Path,
    projects_jsonl: Path | None,
    checkouts: Path | None = None,
) -> None:
    """Check two versions of ruff against a corpus of open-source code."""
    if projects_jsonl:
        repositories = read_projects_jsonl(projects_jsonl)
    else:
        repositories = {(repo.org, repo.repo): repo for repo in REPOSITORIES}

    logger.debug(f"Checking {len(repositories)} projects")

    # https://stackoverflow.com/a/61478547/3549270
    # Otherwise doing 3k repositories can take >8GB RAM
    semaphore = asyncio.Semaphore(50)

    async def limited_parallelism(coroutine: Awaitable[T]) -> T:
        async with semaphore:
            return await coroutine

    results = await asyncio.gather(
        *[
            limited_parallelism(compare(ruff1, ruff2, repo, checkouts))
            for repo in repositories.values()
        ],
        return_exceptions=True,
    )

    diffs = dict(zip(repositories, results, strict=True))
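    # The zip above pairs each (org, repo) key with its result: dicts preserve
    # insertion order and asyncio.gather returns results in input order, so the
    # two sequences line up.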

    total_removed = total_added = 0
    errors = 0

    for diff in diffs.values():
        if isinstance(diff, Exception):
            errors += 1
        else:
            total_removed += len(diff.removed)
            total_added += len(diff.added)

    if total_removed == 0 and total_added == 0 and errors == 0:
        print("\u2705 ecosystem check detected no changes.")
    else:
        rule_changes: dict[str, tuple[int, int]] = {}
        changes = f"(+{total_added}, -{total_removed}, {errors} error(s))"

        print(f"\u2139\ufe0f ecosystem check **detected changes**. {changes}")
        print()

        for (org, repo), diff in diffs.items():
            if isinstance(diff, Exception):
                changes = "error"
                print(f"<details><summary>{repo} ({changes})</summary>")
                repo = repositories[(org, repo)]
                print(
                    f"https://github.com/{repo.org}/{repo.repo} ref {repo.ref} "
                    f"select {repo.select} ignore {repo.ignore} exclude {repo.exclude}",
                )
                print("<p>")
                print()

                print("```")
                print(str(diff))
                print("```")

                print()
                print("</p>")
                print("</details>")
            elif diff:
                changes = f"+{len(diff.added)}, -{len(diff.removed)}"
                print(f"<details><summary>{repo} ({changes})</summary>")
                print("<p>")
                print()

                repo = repositories[(org, repo)]
                diff_lines = list(diff)

                print("<pre>")
                for line in diff_lines:
                    match = DIFF_LINE_RE.match(line)
                    if match is None:
                        print(line)
                        continue

                    pre, inner, path, lnum, post = match.groups()
                    url = repo.url_for(diff.source_sha, path, int(lnum))
                    print(f"{pre} <a href='{url}'>{inner}</a> {post}")
                print("</pre>")

                print()
                print("</p>")
                print("</details>")

                # Count rule changes
                for line in diff_lines:
                    # Find the rule code for the current line, which looks like
                    # + <path>:<line>:<column>: <rule_code> <message>
                    matches = re.search(r": ([A-Z]{1,4}[0-9]{3,4})", line)

                    if matches is None:
                        # Handle case where there are no regex matches e.g.
                        # +                 "?application=AIRFLOW&authenticator=TEST_AUTH&role=TEST_ROLE&warehouse=TEST_WAREHOUSE"
                        # Which was found in local testing
                        continue

                    rule_code = matches.group(1)

                    # Get current additions and removals for this rule
                    current_changes = rule_changes.get(rule_code, (0, 0))

                    # Check if addition or removal depending on the first character
                    if line[0] == "+":
                        current_changes = (current_changes[0] + 1, current_changes[1])
                    elif line[0] == "-":
                        current_changes = (current_changes[0], current_changes[1] + 1)

                    rule_changes[rule_code] = current_changes

            else:
                continue

        if len(rule_changes.keys()) > 0:
            print(f"Rules changed: {len(rule_changes.keys())}")
            print()
            print("| Rule | Changes | Additions | Removals |")
            print("| ---- | ------- | --------- | -------- |")
            for rule, (additions, removals) in sorted(
                rule_changes.items(),
                key=lambda x: (x[1][0] + x[1][1]),
                reverse=True,
            ):
                print(f"| {rule} | {additions + removals} | {additions} | {removals} |")

    logger.debug(f"Finished {len(repositories)} repositories")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Check two versions of ruff against a corpus of open-source code.",
        epilog="scripts/check_ecosystem.py <path/to/ruff1> <path/to/ruff2>",
    )

    parser.add_argument(
        "--projects",
        type=Path,
        help=(
|             "Optional JSON files to use over the default repositories. "
 | |
|             "Supports both github_search_*.jsonl and known-github-tomls.jsonl."
 | |
        ),
    )
    parser.add_argument(
        "--checkouts",
        type=Path,
        help=(
            "Location for the git checkouts, in case you want to save them"
            " (defaults to temporary directory)"
        ),
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Activate debug logging",
    )
    parser.add_argument(
        "ruff1",
        type=Path,
    )
    parser.add_argument(
        "ruff2",
        type=Path,
    )
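    # A hypothetical invocation using the optional flags (paths are placeholders):
    #     scripts/check_ecosystem.py --projects known-github-tomls.jsonl \
    #         --checkouts ./checkouts ./ruff-old ./ruff-new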

    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    loop = asyncio.get_event_loop()
    if args.checkouts:
        args.checkouts.mkdir(exist_ok=True, parents=True)
    main_task = asyncio.ensure_future(
        main(
            ruff1=args.ruff1,
            ruff2=args.ruff2,
            projects_jsonl=args.projects,
            checkouts=args.checkouts,
        ),
    )
    # https://stackoverflow.com/a/58840987/3549270
    for signal in [SIGINT, SIGTERM]:
        loop.add_signal_handler(signal, main_task.cancel)
    try:
        loop.run_until_complete(main_task)
    finally:
        loop.close()