Rewrite ecosystem checks and add ruff format reports (#8223)

Closes #7239 - Refactors `scripts/check_ecosystem.py` into a new Python project at `python/ruff-ecosystem` - Includes [documentation](https://github.com/astral-sh/ruff/blob/zanie/ecosystem-format/python/ruff-ecosystem/README.md) now - Provides a `ruff-ecosystem` CLI - Fixes bug where `ruff check` report included "fixable" summary line - Adds truncation to `ruff check` reports - Otherwise we often won't see the `ruff format` reports - The truncation uses some very simple heuristics and could be improved in the future - Identifies diagnostic changes that occur just because a violation's fix availability changes - We still show the diff for the line because it could matter _where_ this changes, but we could improve this - Similarly, we could improve detection of diagnostic changes where just the message changes - Adds support for JSON ecosystem check output - I added this primarily for development purposes - If there are no changes, only errors while processing projects, we display a different summary message - When caching repositories, we now check out the requested ref - Adds `ruff format` reports, which format with the baseline then use `format --diff` to generate a report - Runs all CI jobs when the CI workflow is changed ## Known problems - Since we must format the project to get a baseline, the permalink line numbers do not exactly correspond to the correct range - This looks... hard. I tried using `git diff` and some wonky hunk matching to recover the original line numbers but it doesn't seem worth it. I think we should probably commit the formatted changes to a fork or something if we want great results here. Consequently, I've just used the start line instead of a range for now. - I don't love the comment structure — it'd be nice, perhaps, to have separate headings for the linter and formatter.
- However, the `pr-comment` workflow is an absolute pain to change because it runs _separately_ from this pull request so if I want to make edits to it I can only test it via manual workflow dispatch. - Lines are not printed "as we go" which means they're all held in memory; presumably this would be a problem for large-scale ecosystem checks - We are encountering a hard limit with the maximum comment length supported by GitHub. We will need to move the bulk of the report elsewhere. ## Future work - Update `ruff-ecosystem` to support non-default projects and `check_ecosystem_all.py` behavior - Remove existing ecosystem check scripts - Add preview mode toggle (#8076) - Add a toggle for truncation - Add hints for quick reproduction of runs locally - Consider parsing JSON output of Ruff instead of using regex to parse the text output - Links to project repositories should use the commit hash we checked against - When caching repositories, we should pull the latest changes for the ref - Sort check diffs by path and rule code only (changes in messages should not change order) - Update check diffs to distinguish between new violations and changes in messages - Add "fix" diffs - Remove existing formatter similarity reports - On release pull request, compare to the previous tag instead --------- Co-authored-by: konsti <konstin@mailbox.org>
2025-09-29 21:34:57 +00:00 · 2023-10-27 17:28:01 -05:00 · 2023-10-27 17:28:01 -05:00 · fc94857a20
commit fc94857a20
parent 5f26411577
14 changed files with 1555 additions and 8 deletions
--- a/python/ruff-ecosystem/ruff_ecosystem/cli.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/cli.py
@ -0,0 +1,166 @@
+import argparse
+import asyncio
+import logging
+import os
+import shutil
+import sys
+import sysconfig
+import tempfile
+from contextlib import nullcontext
+from pathlib import Path
+from signal import SIGINT, SIGTERM
+
+from ruff_ecosystem import logger
+from ruff_ecosystem.defaults import DEFAULT_TARGETS
+from ruff_ecosystem.main import OutputFormat, main
+from ruff_ecosystem.projects import RuffCommand
+
+
+def excepthook(type, value, tb):
+    """Exception hook that opens a post-mortem debugger when a terminal is available.
+
+    When the interpreter is interactive (``sys.ps1`` is defined) or stderr is
+    not a TTY, the exception is handed to the default ``sys.__excepthook__``.
+    Otherwise the traceback is printed and ``pdb`` is started on it.
+    """
+    interactive = hasattr(sys, "ps1")
+    if interactive or not sys.stderr.isatty():
+        # No usable terminal for a debugger session; defer to the default hook.
+        sys.__excepthook__(type, value, tb)
+        return
+
+    # Imported lazily so the debugger machinery is only loaded on failure.
+    import pdb
+    import traceback
+
+    traceback.print_exception(type, value, tb)
+    print()
+    pdb.post_mortem(tb)
+
+
+def _resolve_executable(executable: Path, label: str) -> Path:
+    """Return a usable path for a Ruff executable given on the command line.
+
+    If ``executable`` exists on disk it is returned unchanged. Otherwise it is
+    treated as a command name and looked up with ``get_executable_path``.
+    ``label`` (e.g. "baseline" or "comparison") is only used in messages.
+
+    Exits the process with status 1 when the executable cannot be found.
+    """
+    if executable.exists():
+        return executable
+
+    resolved = get_executable_path(str(executable))
+    if resolved is None:
+        # `file=` is required; a positional stream would be printed as text
+        # on stdout instead of receiving the message.
+        print(
+            f"Could not find ruff {label} executable: {executable}",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    logger.info("Resolved %s executable %s to %s", label, executable, resolved)
+    return resolved
+
+
+def entrypoint():
+    """Run the command line interface.
+
+    Parses arguments, configures logging (and the ``pdb`` exception hook when
+    requested), resolves both Ruff executables, and then drives ``main`` to
+    completion on a dedicated event loop. SIGINT and SIGTERM cancel the main
+    task where the event loop supports signal handlers.
+    """
+    args = parse_args()
+
+    if args.pdb:
+        sys.excepthook = excepthook
+
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+    # Validate the executables before creating any cache directory so a bad
+    # argument does not leave a temporary directory behind.
+    ruff_baseline = _resolve_executable(args.ruff_baseline, "baseline")
+    ruff_comparison = _resolve_executable(args.ruff_comparison, "comparison")
+
+    # Use a temporary directory for caching if no cache is specified
+    cache_context = (
+        tempfile.TemporaryDirectory() if not args.cache else nullcontext(args.cache)
+    )
+
+    with cache_context as cache:
+        # Create the loop explicitly: `asyncio.get_event_loop()` without a
+        # running loop is deprecated and fails on newer Python versions.
+        loop = asyncio.new_event_loop()
+        main_task = loop.create_task(
+            main(
+                command=RuffCommand(args.ruff_command),
+                ruff_baseline_executable=ruff_baseline,
+                ruff_comparison_executable=ruff_comparison,
+                targets=DEFAULT_TARGETS,
+                format=OutputFormat(args.output_format),
+                project_dir=Path(cache),
+                raise_on_failure=args.pdb,
+            )
+        )
+        # https://stackoverflow.com/a/58840987/3549270
+        for signum in [SIGINT, SIGTERM]:
+            try:
+                loop.add_signal_handler(signum, main_task.cancel)
+            except NotImplementedError:
+                # Signal handlers are only available on Unix event loops.
+                logger.debug("Signal handlers are not supported by this event loop")
+                break
+        try:
+            loop.run_until_complete(main_task)
+        finally:
+            loop.close()
+
+
+def parse_args() -> argparse.Namespace:
+    """Build the command line parser and parse ``sys.argv``.
+
+    Returns a namespace with ``cache``, ``output_format``, ``verbose``, ``pdb``,
+    ``ruff_command``, ``ruff_baseline`` and ``ruff_comparison`` attributes.
+    """
+    parser = argparse.ArgumentParser(
+        description="Check two versions of ruff against a corpus of open-source code.",
+    )
+
+    # TODO: Support non-default `--targets`
+    # parser.add_argument(
+    #     "--targets",
+    #     type=Path,
+    #     help=(
+    #         "Optional JSON files to use over the default repositories. "
+    #         "Supports both github_search_*.jsonl and known-github-tomls.jsonl."
+    #     ),
+    # )
+    parser.add_argument(
+        "--cache",
+        type=Path,
+        help="Location for caching cloned repositories",
+    )
+    parser.add_argument(
+        "--output-format",
+        # The parsed string is converted with `OutputFormat(...)`, which looks
+        # members up by value, so the accepted choices must be the values.
+        choices=[option.value for option in OutputFormat],
+        default="json",
+        help="Format for the ecosystem check output",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Enable debug logging",
+    )
+    parser.add_argument(
+        "--pdb",
+        action="store_true",
+        help="Enable debugging on failure",
+    )
+    parser.add_argument(
+        "ruff_command",
+        # Converted with `RuffCommand(...)`, i.e. by value (see above).
+        choices=[option.value for option in RuffCommand],
+        help="The Ruff command to test",
+    )
+    parser.add_argument(
+        "ruff_baseline",
+        type=Path,
+        help="Path to, or name of, the baseline ruff executable",
+    )
+    parser.add_argument(
+        "ruff_comparison",
+        type=Path,
+        help="Path to, or name of, the ruff executable to compare against the baseline",
+    )
+
+    return parser.parse_args()
+
+
+def get_executable_path(name: str) -> Path | None:
+    """Locate an executable by name.
+
+    The current interpreter's scripts directory is checked first, then the
+    locations searched by ``shutil.which``. On Windows, ``.exe`` is appended
+    to ``name`` when missing. Returns ``None`` if nothing is found.
+    """
+    # Add suffix for Windows executables
+    if sys.platform == "win32" and not name.endswith(".exe"):
+        name = name + ".exe"
+
+    # The executable in the current interpreter's scripts directory.
+    scripts_candidate = os.path.join(sysconfig.get_path("scripts"), name)
+    if os.path.exists(scripts_candidate):
+        return Path(scripts_candidate)
+
+    # The executable in the global environment.
+    located = shutil.which(name)
+    return Path(located) if located else None