ruff/scripts/check_docs_formatted.py

#!/usr/bin/env python3
"""Check code snippets in docs are formatted by black."""
import argparse
import os
import re
import textwrap
from collections.abc import Sequence
from pathlib import Path
from re import Match

import black
from black.mode import Mode, TargetVersion
from black.parsing import InvalidInput

TARGET_VERSIONS = ["py37", "py38", "py39", "py310", "py311"]
SNIPPED_RE = re.compile(
    r"(?P<before>^(?P<indent> *)```\s*python\n)"
    r"(?P<code>.*?)"
    r"(?P<after>^(?P=indent)```\s*$)",
    re.DOTALL | re.MULTILINE,
)

# For some rules, we don't want black to fix the formatting as this would "fix" the
# example.
KNOWN_FORMATTING_VIOLATIONS = [
    "avoidable-escaped-quote",
    "bad-quotes-docstring",
    "bad-quotes-inline-string",
    "bad-quotes-multiline-string",
    "explicit-string-concatenation",
    "indentation-with-invalid-multiple",
    "line-too-long",
    "missing-trailing-comma",
    "multi-line-implicit-string-concatenation",
    "multiple-leading-hashes-for-block-comment",
    "multiple-spaces-after-keyword",
    "multiple-spaces-after-operator",
    "multiple-spaces-before-keyword",
    "multiple-spaces-before-operator",
    "multiple-statements-on-one-line-colon",
    "multiple-statements-on-one-line-semicolon",
    "no-indented-block-comment",
    "no-space-after-block-comment",
    "no-space-after-inline-comment",
    "over-indented",
    "prohibited-trailing-comma",
    "shebang-leading-whitespace",
    "too-few-spaces-before-inline-comment",
    "trailing-comma-on-bare-tuple",
    "unexpected-indentation-comment",
    "unnecessary-class-parentheses",
    "useless-semicolon",
    "whitespace-after-open-bracket",
    "whitespace-before-close-bracket",
    "whitespace-before-punctuation",
]

# For some docs, black is unable to parse the example code.
KNOWN_PARSE_ERRORS = [
    "blank-line-with-whitespace",
    "indentation-with-invalid-multiple-comment",
    "missing-newline-at-end-of-file",
    "mixed-spaces-and-tabs",
    "no-indented-block",
    "tab-after-keyword",
    "tab-after-operator",
    "tab-before-keyword",
    "tab-before-operator",
    "trailing-whitespace",
    "unexpected-indentation",
]


class CodeBlockError(Exception):
    """A code block parse error."""


def format_str(
    src: str,
    black_mode: black.FileMode,
) -> tuple[str, Sequence[CodeBlockError]]:
    """Format a single docs file string."""
    errors: list[CodeBlockError] = []

    def _snipped_match(match: Match[str]) -> str:
        code = textwrap.dedent(match["code"])
        try:
            code = black.format_str(code, mode=black_mode)
        except InvalidInput as e:
            errors.append(CodeBlockError(e))

        code = textwrap.indent(code, match["indent"])
        return f'{match["before"]}{code}{match["after"]}'

    src = SNIPPED_RE.sub(_snipped_match, src)
    return src, errors


def format_file(
    file: Path,
    black_mode: black.FileMode,
    error_known: bool,
    args: argparse.Namespace,
) -> int:
    """Check the formatting of a single docs file.

    Returns the exit code for the script.
    """
    with file.open() as f:
        contents = f.read()

    if file.parent.name == "rules":
        # Check contents contains "What it does" section
        if "## What it does" not in contents:
            print(f"Docs for `{file.name}` are missing the `What it does` section.")
            return 1

        # Check contents contains "Why is this bad?" section
        if "## Why is this bad?" not in contents:
            print(f"Docs for `{file.name}` are missing the `Why is this bad?` section.")
            return 1

    # Remove everything before the first example
    contents = contents[contents.find("## Example") :]

    # Remove everything after the last example
    contents = contents[: contents.rfind("```")] + "```"

    new_contents, errors = format_str(contents, black_mode)

    if errors and not args.skip_errors and not error_known:
        for error in errors:
            rule_name = file.name.split(".")[0]
            print(f"Docs parse error for `{rule_name}` docs: {error}")

        return 2

    if contents != new_contents:
        rule_name = file.name.split(".")[0]
        print(
            f"Rule `{rule_name}` docs are not formatted. The example section "
            f"should be rewritten to:",
        )

        # Add indentation so that snipped can be copied directly to docs
        for line in new_contents.splitlines():
            output_line = "///"
            if len(line) > 0:
                output_line = f"{output_line} {line}"

            print(output_line)

        print("\n")

        return 1

    return 0


def main(argv: Sequence[str] | None = None) -> int:
    """Check code snippets in docs are formatted by black."""
    parser = argparse.ArgumentParser(
        description="Check code snippets in docs are formatted by black.",
    )
    parser.add_argument("--skip-errors", action="store_true")
    parser.add_argument("--generate-docs", action="store_true")
    args = parser.parse_args(argv)

    if args.generate_docs:
        # Generate docs
        from generate_mkdocs import main as generate_docs

        generate_docs()

    # Get static docs
    static_docs = []
    for file in os.listdir("docs"):
        if file.endswith(".md"):
            static_docs.append(Path("docs") / file)

    # Check rules generated
    if not Path("docs/rules").exists():
        print("Please generate rules first.")
        return 1

    # Get generated rules
    generated_docs = []
    for file in os.listdir("docs/rules"):
        if file.endswith(".md"):
            generated_docs.append(Path("docs/rules") / file)

    if len(generated_docs) == 0:
        print("Please generate rules first.")
        return 1

    black_mode = Mode(
        target_versions={TargetVersion[val.upper()] for val in TARGET_VERSIONS},
    )

    # Check known formatting violations and parse errors are sorted alphabetically and
    # have no duplicates. This will reduce the diff when adding new violations

    for known_list, file_string in [
        (KNOWN_FORMATTING_VIOLATIONS, "formatting violations"),
        (KNOWN_PARSE_ERRORS, "parse errors"),
    ]:
        if known_list != sorted(known_list):
            print(
                f"Known {file_string} is not sorted alphabetically. Please sort and "
                f"re-run.",
            )
            return 1

        duplicates = list({x for x in known_list if known_list.count(x) > 1})
        if len(duplicates) > 0:
            print(f"Known {file_string} has duplicates:")
            print("\n".join([f"  - {x}" for x in duplicates]))
            print("Please remove them and re-run.")
            return 1

    violations = 0
    errors = 0
    for file in [*static_docs, *generated_docs]:
        rule_name = file.name.split(".")[0]
        if rule_name in KNOWN_FORMATTING_VIOLATIONS:
            continue

        error_known = rule_name in KNOWN_PARSE_ERRORS

        result = format_file(file, black_mode, error_known, args)
        if result == 1:
            violations += 1
        elif result == 2 and not error_known:
            errors += 1

    if violations > 0:
        print(f"Formatting violations identified: {violations}")

    if errors > 0:
        print(f"New code block parse errors identified: {errors}")

    if violations > 0 or errors > 0:
        return 1

    return 0


if __name__ == "__main__":
    raise SystemExit(main())