Rewrite ecosystem checks and add ruff format reports (#8223)

Closes #7239 - Refactors `scripts/check_ecosystem.py` into a new Python project at `python/ruff-ecosystem` - Includes [documentation](https://github.com/astral-sh/ruff/blob/zanie/ecosystem-format/python/ruff-ecosystem/README.md) now - Provides a `ruff-ecosystem` CLI - Fixes bug where `ruff check` report included "fixable" summary line - Adds truncation to `ruff check` reports - Otherwise we often won't see the `ruff format` reports - The truncation uses some very simple heuristics and could be improved in the future - Identifies diagnostic changes that occur just because a violation's fix available changes - We still show the diff for the line because it's could matter _where_ this changes, but we could improve this - Similarly, we could improve detection of diagnostic changes where just the message changes - Adds support for JSON ecosystem check output - I added this primarily for development purposes - If there are no changes, only errors while processing projects, we display a different summary message - When caching repositories, we now checkout the requested ref - Adds `ruff format` reports, which format with the baseline then the use `format --diff` to generate a report - Runs all CI jobs when the CI workflow is changed ## Known problems - Since we must format the project to get a baseline, the permalink line numbers do not exactly correspond to the correct range - This looks... hard. I tried using `git diff` and some wonky hunk matching to recover the original line numbers but it doesn't seem worth it. I think we should probably commit the formatted changes to a fork or something if we want great results here. Consequently, I've just used the start line instead of a range for now. - I don't love the comment structure — it'd be nice, perhaps, to have separate headings for the linter and formatter. - However, the `pr-comment` workflow is an absolute pain to change because it runs _separately_ from this pull request so I if I want to make edits to it I can only test it via manual workflow dispatch. - Lines are not printed "as we go" which means they're all held in memory, presumably this would be a problem for large-scale ecosystem checks - We are encountering a hard limit with the maximum comment length supported by GitHub. We will need to move the bulk of the report elsewhere. ## Future work - Update `ruff-ecosystem` to support non-default projects and `check_ecosystem_all.py` behavior - Remove existing ecosystem check scripts - Add preview mode toggle (#8076) - Add a toggle for truncation - Add hints for quick reproduction of runs locally - Consider parsing JSON output of Ruff instead of using regex to parse the text output - Links to project repositories should use the commit hash we checked against - When caching repositories, we should pull the latest changes for the ref - Sort check diffs by path and rule code only (changes in messages should not change order) - Update check diffs to distinguish between new violations and changes in messages - Add "fix" diffs - Remove existing formatter similarity reports - On release pull request, compare to the previous tag instead --------- Co-authored-by: konsti <konstin@mailbox.org>
2025-07-24 05:25:17 +00:00 · 2023-10-27 17:28:01 -05:00 · 2023-10-27 17:28:01 -05:00 · fc94857a20
commit fc94857a20
parent 5f26411577
14 changed files with 1555 additions and 8 deletions
--- a/python/ruff-ecosystem/ruff_ecosystem/format.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/format.py
@ -0,0 +1,195 @@
+"""
+Execution, comparison, and summary of `ruff format` ecosystem checks.
+"""
+
+from __future__ import annotations
+
+import time
+from asyncio import create_subprocess_exec
+from dataclasses import dataclass
+from pathlib import Path
+from subprocess import PIPE
+from typing import TYPE_CHECKING, Sequence
+
+from unidiff import PatchSet
+
+from ruff_ecosystem import logger
+from ruff_ecosystem.markdown import markdown_project_section
+from ruff_ecosystem.types import Comparison, Diff, Result, RuffError
+
+if TYPE_CHECKING:
+    from ruff_ecosystem.projects import ClonedRepository
+
+
+def markdown_format_result(result: Result) -> str:
+    """
+    Render a `ruff format` ecosystem check result as markdown.
+    """
+    lines = []
+    total_lines_removed = total_lines_added = 0
+    total_files_modified = 0
+    error_count = len(result.errored)
+    patch_sets = []
+
+    for project, comparison in result.completed:
+        total_lines_added += comparison.diff.lines_added
+        total_lines_removed += comparison.diff.lines_removed
+
+        patch_set = PatchSet("\n".join(comparison.diff.lines))
+        patch_sets.append(patch_set)
+        total_files_modified += len(patch_set.modified_files)
+
+    if total_lines_removed == 0 and total_lines_added == 0 and error_count == 0:
+        return "\u2705 ecosystem check detected no format changes."
+
+    # Summarize the total changes
+    if total_lines_added == 0 and total_lines_added == 0:
+        # Only errors
+        s = "s" if error_count != 1 else ""
+        lines.append(
+            f"\u2139\ufe0f ecosystem check **encountered format errors**. (no format changes; {error_count} project error{s})"
+        )
+    else:
+        s = "s" if total_files_modified != 1 else ""
+        changes = f"+{total_lines_added} -{total_lines_removed} lines in {total_files_modified} file{s} in {len(result.completed)} projects"
+        if error_count:
+            s = "s" if error_count != 1 else ""
+            changes += f"; {error_count} project error{s}"
+
+        lines.append(
+            f"\u2139\ufe0f ecosystem check **detected format changes**. ({changes})"
+        )
+
+    lines.append("")
+
+    # Then per-project changes
+    for (project, comparison), patch_set in zip(result.completed, patch_sets):
+        if not comparison.diff:
+            continue  # Skip empty diffs
+
+        files = len(patch_set.modified_files)
+        s = "s" if files != 1 else ""
+        title = f"+{comparison.diff.lines_added} -{comparison.diff.lines_removed} lines across {files} file{s}"
+
+        lines.extend(
+            markdown_project_section(
+                title=title,
+                content=format_patchset(patch_set, comparison.repo),
+                options=project.format_options,
+                project=project,
+            )
+        )
+
+    for project, error in result.errored:
+        lines.extend(
+            markdown_project_section(
+                title="error",
+                content=f"```\n{error}```",
+                options=project.format_options,
+                project=project,
+            )
+        )
+
+    return "\n".join(lines)
+
+
+def format_patchset(patch_set: PatchSet, repo: ClonedRepository) -> str:
+    """
+    Convert a patchset to markdown, adding permalinks to the start of each hunk.
+    """
+    lines = []
+    for file_patch in patch_set:
+        for hunk in file_patch:
+            # Note:  When used for `format` checks, the line number is not exact because
+            #        we formatted the repository for a baseline; we can't know the exact
+            #        line number in the original
+            #        source file.
+            hunk_link = repo.url_for(file_patch.path, hunk.source_start)
+            hunk_lines = str(hunk).splitlines()
+
+            # Add a link before the hunk
+            link_title = file_patch.path + "~L" + str(hunk.source_start)
+            lines.append(f"<a href='{hunk_link}'>{link_title}</a>")
+
+            # Wrap the contents of the hunk in a diff code block
+            lines.append("```diff")
+            lines.extend(hunk_lines[1:])
+            lines.append("```")
+
+    return "\n".join(lines)
+
+
+async def compare_format(
+    ruff_baseline_executable: Path,
+    ruff_comparison_executable: Path,
+    options: FormatOptions,
+    cloned_repo: ClonedRepository,
+):
+    # Run format without diff to get the baseline
+    await ruff_format(
+        executable=ruff_baseline_executable.resolve(),
+        path=cloned_repo.path,
+        name=cloned_repo.fullname,
+        options=options,
+    )
+    # Then get the diff from stdout
+    diff = await ruff_format(
+        executable=ruff_comparison_executable.resolve(),
+        path=cloned_repo.path,
+        name=cloned_repo.fullname,
+        options=options,
+        diff=True,
+    )
+
+    return Comparison(diff=Diff(diff), repo=cloned_repo)
+
+
+async def ruff_format(
+    *,
+    executable: Path,
+    path: Path,
+    name: str,
+    options: FormatOptions,
+    diff: bool = False,
+) -> Sequence[str]:
+    """Run the given ruff binary against the specified path."""
+    logger.debug(f"Formatting {name} with {executable}")
+    ruff_args = options.to_cli_args()
+
+    if diff:
+        ruff_args.append("--diff")
+
+    start = time.time()
+    proc = await create_subprocess_exec(
+        executable.absolute(),
+        *ruff_args,
+        ".",
+        stdout=PIPE,
+        stderr=PIPE,
+        cwd=path,
+    )
+    result, err = await proc.communicate()
+    end = time.time()
+
+    logger.debug(f"Finished formatting {name} with {executable} in {end - start:.2f}s")
+
+    if proc.returncode not in [0, 1]:
+        raise RuffError(err.decode("utf8"))
+
+    lines = result.decode("utf8").splitlines()
+    return lines
+
+
+@dataclass(frozen=True)
+class FormatOptions:
+    """
+    Ruff format options.
+    """
+
+    exclude: str = ""
+
+    def to_cli_args(self) -> list[str]:
+        args = ["format"]
+        if self.exclude:
+            args.extend(["--exclude", self.exclude])
+        return args