Rewrite ecosystem checks and add ruff format reports (#8223)

Closes #7239 

- Refactors `scripts/check_ecosystem.py` into a new Python project at
`python/ruff-ecosystem`
- Includes
[documentation](https://github.com/astral-sh/ruff/blob/zanie/ecosystem-format/python/ruff-ecosystem/README.md)
now
    - Provides a `ruff-ecosystem` CLI
- Fixes a bug where the `ruff check` report included the "fixable" summary line
- Adds truncation to `ruff check` reports
    - Otherwise we often won't see the `ruff format` reports
    - The truncation uses some very simple heuristics and could be improved in the future
- Identifies diagnostic changes that occur just because a violation's fix availability changes
    - We still show the diff for the line because it could matter _where_ this changes, but we could improve this
    - Similarly, we could improve detection of diagnostic changes where just the message changes
- Adds support for JSON ecosystem check output
    - I added this primarily for development purposes
- If there are no changes and only errors while processing projects, we display a different summary message
- When caching repositories, we now check out the requested ref
- Adds `ruff format` reports, which format with the baseline executable and then use `format --diff` to generate a report (a sketch of this flow follows the list)
- Runs all CI jobs when the CI workflow is changed
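
To make the `ruff format` comparison concrete, here is a minimal sketch of the baseline-then-diff flow described above. It assumes `checkout_dir` is a cloned project and that the two paths point at Ruff builds; `format_report` is a hypothetical helper for illustration, not the actual `compare_format` implementation.

```python
import subprocess
from pathlib import Path


def format_report(
    baseline_executable: Path,
    comparison_executable: Path,
    checkout_dir: Path,
) -> str:
    """Format with the baseline executable, then diff with the comparison one."""
    # Apply the baseline formatting in place so the subsequent diff only shows
    # changes introduced by the comparison executable.
    subprocess.run(
        [str(baseline_executable), "format", "."],
        cwd=checkout_dir,
        check=True,
        capture_output=True,
    )
    # `format --diff` exits non-zero when it would reformat files, so the exit
    # code is not treated as an error here; the diff on stdout is the report.
    completed = subprocess.run(
        [str(comparison_executable), "format", "--diff", "."],
        cwd=checkout_dir,
        capture_output=True,
        text=True,
    )
    return completed.stdout
```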

## Known problems

- Since we must format the project to get a baseline, the permalink line
numbers do not exactly correspond to the correct range
    - This looks... hard. I tried using `git diff` and some wonky hunk matching to recover the original line numbers but it doesn't seem worth it. I think we should probably commit the formatted changes to a fork or something if we want great results here. Consequently, I've just used the start line instead of a range for now.
- I don't love the comment structure — it'd be nice, perhaps, to have
separate headings for the linter and formatter.
    - However, the `pr-comment` workflow is an absolute pain to change because it runs _separately_ from this pull request, so if I want to make edits to it I can only test it via manual workflow dispatch.
- Lines are not printed "as we go", which means they're all held in memory; presumably this would be a problem for large-scale ecosystem checks
- We are encountering a hard limit with the maximum comment length supported by GitHub (a sketch of budget-based truncation follows this list). We will need to move the bulk of the report elsewhere.
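
As a rough illustration of the kind of budget-based truncation involved, here is a minimal sketch that trims report lines so the rendered comment stays under the `GITHUB_MAX_COMMENT_LENGTH` constant defined in the code below. It assumes the report is rendered by joining lines with newlines; this is an illustration, not the heuristic `ruff-ecosystem` actually uses.

```python
GITHUB_MAX_COMMENT_LENGTH = 65536  # GitHub rejects comment bodies longer than this


def truncate_lines(
    lines: list[str], max_length: int = GITHUB_MAX_COMMENT_LENGTH
) -> list[str]:
    """Keep whole lines from the start until the character budget is exhausted."""
    marker = "... (truncated)"
    # Reserve room for the truncation marker so the final report stays in budget
    budget = max_length - len(marker) - 1
    truncated: list[str] = []
    length = 0
    for line in lines:
        # The +1 accounts for the newline used when joining lines back together
        if length + len(line) + 1 > budget:
            truncated.append(marker)
            break
        truncated.append(line)
        length += len(line) + 1
    return truncated
```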

## Future work

- Update `ruff-ecosystem` to support non-default projects and
`check_ecosystem_all.py` behavior
- Remove existing ecosystem check scripts
- Add preview mode toggle (#8076)
- Add a toggle for truncation
- Add hints for quick reproduction of runs locally
- Consider parsing JSON output of Ruff instead of using regex to parse the text output (see the sketch after this list)
- Links to project repositories should use the commit hash we checked
against
- When caching repositories, we should pull the latest changes for the
ref
- Sort check diffs by path and rule code only (changes in messages
should not change order)
- Update check diffs to distinguish between new violations and changes
in messages
- Add "fix" diffs
- Remove existing formatter similarity reports
- On release pull request, compare to the previous tag instead

---------

Co-authored-by: konsti <konstin@mailbox.org>
@@ -0,0 +1,144 @@
import asyncio
import dataclasses
import json
from enum import Enum
from pathlib import Path
from typing import Awaitable, TypeVar

from ruff_ecosystem import logger
from ruff_ecosystem.check import compare_check, markdown_check_result
from ruff_ecosystem.format import compare_format, markdown_format_result
from ruff_ecosystem.projects import (
    Project,
    RuffCommand,
)
from ruff_ecosystem.types import Comparison, Result, Serializable

T = TypeVar("T")

# The maximum length of a GitHub comment body
GITHUB_MAX_COMMENT_LENGTH = 65536


class OutputFormat(Enum):
    markdown = "markdown"
    json = "json"


async def main(
    command: RuffCommand,
    ruff_baseline_executable: Path,
    ruff_comparison_executable: Path,
    targets: list[Project],
    project_dir: Path,
    format: OutputFormat,
    max_parallelism: int = 50,
    raise_on_failure: bool = False,
) -> None:
    """Compare the baseline and comparison executables across the target projects
    and print a report in the requested format."""
    logger.debug("Using command %s", command.value)
    logger.debug("Using baseline executable at %s", ruff_baseline_executable)
    logger.debug("Using comparison executable at %s", ruff_comparison_executable)
    logger.debug("Using checkout_dir directory %s", project_dir)
    logger.debug("Checking %s targets", len(targets))

    # Limit parallelism to avoid high memory consumption
    semaphore = asyncio.Semaphore(max_parallelism)

    async def limited_parallelism(coroutine: Awaitable[T]) -> T:
        async with semaphore:
            return await coroutine

    comparisons: list[Exception | Comparison] = await asyncio.gather(
        *[
            limited_parallelism(
                clone_and_compare(
                    command,
                    ruff_baseline_executable,
                    ruff_comparison_executable,
                    target,
                    project_dir,
                )
            )
            for target in targets
        ],
        return_exceptions=not raise_on_failure,
    )
    comparisons_by_target = dict(zip(targets, comparisons, strict=True))

    # Split comparisons into errored / completed
    errored, completed = [], []
    for target, comparison in comparisons_by_target.items():
        if isinstance(comparison, Exception):
            errored.append((target, comparison))
        else:
            completed.append((target, comparison))

    result = Result(completed=completed, errored=errored)

    match format:
        case OutputFormat.json:
            print(json.dumps(result, indent=4, cls=JSONEncoder))
        case OutputFormat.markdown:
            match command:
                case RuffCommand.check:
                    print(markdown_check_result(result))
                case RuffCommand.format:
                    print(markdown_format_result(result))
                case _:
                    raise ValueError(f"Unknown target Ruff command {command}")
        case _:
            raise ValueError(f"Unknown output format {format}")

    return None


async def clone_and_compare(
    command: RuffCommand,
    ruff_baseline_executable: Path,
    ruff_comparison_executable: Path,
    target: Project,
    project_dir: Path,
) -> Comparison:
    """Check a specific repository against two versions of ruff."""
    assert ":" not in target.repo.owner
    assert ":" not in target.repo.name

    match command:
        case RuffCommand.check:
            compare, options = (
                compare_check,
                target.check_options,
            )
        case RuffCommand.format:
            compare, options = (
                compare_format,
                target.format_options,
            )
        case _:
            raise ValueError(f"Unknown target Ruff command {command}")

    checkout_dir = project_dir.joinpath(f"{target.repo.owner}:{target.repo.name}")
    cloned_repo = await target.repo.clone(checkout_dir)

    try:
        return await compare(
            ruff_baseline_executable,
            ruff_comparison_executable,
            options,
            cloned_repo,
        )
    except ExceptionGroup as e:
        raise e.exceptions[0] from e


class JSONEncoder(json.JSONEncoder):
    """Encode ecosystem objects, dataclasses, paths, sets, and exceptions as JSON."""

    def default(self, o):
        if isinstance(o, Serializable):
            return o.jsonable()
        if dataclasses.is_dataclass(o):
            return dataclasses.asdict(o)
        if isinstance(o, set):
            return tuple(o)
        if isinstance(o, Path):
            return str(o)
        if isinstance(o, Exception):
            return str(o)
        return super().default(o)
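
The concurrency pattern used in `main` above, a semaphore-capped `asyncio.gather` with `return_exceptions`, can be illustrated in isolation. This is a standalone sketch of the pattern, not part of `ruff-ecosystem`; the failing task stands in for a project that errors during processing.

```python
import asyncio


async def demo(max_parallelism: int = 2) -> None:
    semaphore = asyncio.Semaphore(max_parallelism)

    async def limited(coroutine):
        # At most `max_parallelism` coroutines run their bodies concurrently
        async with semaphore:
            return await coroutine

    async def work(i: int) -> int:
        if i == 3:
            raise RuntimeError("simulated project failure")
        await asyncio.sleep(0.01)
        return i * i

    # With return_exceptions=True, a failure is returned alongside the other
    # results instead of cancelling them, mirroring raise_on_failure=False.
    results = await asyncio.gather(
        *[limited(work(i)) for i in range(5)], return_exceptions=True
    )
    print(results)  # [0, 1, 4, RuntimeError('simulated project failure'), 16]


asyncio.run(demo())
```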