cpython/Doc/tools/check-warnings.py

#!/usr/bin/env python3
"""
Check the output of running Sphinx in nit-picky mode (missing references).
"""
from __future__ import annotations

import argparse
import itertools
import os
import re
import subprocess
import sys
from pathlib import Path
from typing import TextIO

# Exclude these whether they're dirty or clean,
# because they trigger a rebuild of dirty files.
EXCLUDE_FILES = {
    "Doc/whatsnew/changelog.rst",
}

# Subdirectories of Doc/ to exclude.
EXCLUDE_SUBDIRS = {
    ".env",
    ".venv",
    "env",
    "includes",
    "venv",
}

# Regex pattern to match the parts of a Sphinx warning
WARNING_PATTERN = re.compile(
    r"(?P<file>([A-Za-z]:[\\/])?[^:]+):(?P<line>\d+): WARNING: (?P<msg>.+)"
)

# Regex pattern to match the line numbers in a Git unified diff
DIFF_PATTERN = re.compile(
    r"^@@ -(?P<linea>\d+)(?:,(?P<removed>\d+))? \+(?P<lineb>\d+)(?:,(?P<added>\d+))? @@",
    flags=re.MULTILINE,
)


def get_diff_files(ref_a: str, ref_b: str, filter_mode: str = "") -> set[Path]:
    """List the files changed between two Git refs, filtered by change type."""
    added_files_result = subprocess.run(
        [
            "git",
            "diff",
            f"--diff-filter={filter_mode}",
            "--name-only",
            f"{ref_a}...{ref_b}",
            "--",
        ],
        stdout=subprocess.PIPE,
        check=True,
        text=True,
        encoding="UTF-8",
    )

    added_files = added_files_result.stdout.strip().split("\n")
    return {Path(file.strip()) for file in added_files if file.strip()}


def get_diff_lines(ref_a: str, ref_b: str, file: Path) -> list[int]:
    """List the lines changed between two Git refs for a specific file."""
    diff_output = subprocess.run(
        [
            "git",
            "diff",
            "--unified=0",
            f"{ref_a}...{ref_b}",
            "--",
            str(file),
        ],
        stdout=subprocess.PIPE,
        check=True,
        text=True,
        encoding="UTF-8",
    )

    # Scrape line offsets + lengths from diff and convert to line numbers
    line_matches = DIFF_PATTERN.finditer(diff_output.stdout)
    # Removed and added line counts are 1 if not printed
    line_match_values = [
        line_match.groupdict(default=1) for line_match in line_matches
    ]
    line_ints = [
        (int(match_value["lineb"]), int(match_value["added"]))
        for match_value in line_match_values
    ]
    line_ranges = [
        range(line_b, line_b + added) for line_b, added in line_ints
    ]
    line_numbers = list(itertools.chain(*line_ranges))

    return line_numbers


def get_para_line_numbers(file_obj: TextIO) -> list[list[int]]:
    """Get the line numbers of text in a file object, grouped by paragraph."""
    paragraphs = []
    prev_line = None
    for lineno, line in enumerate(file_obj):
        lineno = lineno + 1
        if prev_line is None or (line.strip() and not prev_line.strip()):
            paragraph = [lineno - 1]
            paragraphs.append(paragraph)
        paragraph.append(lineno)
        prev_line = line
    return paragraphs


def filter_and_parse_warnings(
    warnings: list[str], files: set[Path]
) -> list[re.Match[str]]:
    """Get the warnings matching passed files and parse them with regex."""
    filtered_warnings = [
        warning
        for warning in warnings
        if any(str(file) in warning for file in files)
    ]
    warning_matches = [
        WARNING_PATTERN.fullmatch(warning.strip())
        for warning in filtered_warnings
    ]
    non_null_matches = [warning for warning in warning_matches if warning]
    return non_null_matches


def filter_warnings_by_diff(
    warnings: list[re.Match[str]], ref_a: str, ref_b: str, file: Path
) -> list[re.Match[str]]:
    """Filter the passed per-file warnings to just those on changed lines."""
    diff_lines = get_diff_lines(ref_a, ref_b, file)
    with file.open(encoding="UTF-8") as file_obj:
        paragraphs = get_para_line_numbers(file_obj)
    touched_paras = [
        para_lines
        for para_lines in paragraphs
        if set(diff_lines) & set(para_lines)
    ]
    touched_para_lines = set(itertools.chain(*touched_paras))
    warnings_infile = [
        warning for warning in warnings if str(file) in warning["file"]
    ]
    warnings_touched = [
        warning
        for warning in warnings_infile
        if int(warning["line"]) in touched_para_lines
    ]
    return warnings_touched


def process_touched_warnings(
    warnings: list[str], ref_a: str, ref_b: str
) -> list[re.Match[str]]:
    """Filter a list of Sphinx warnings to those affecting touched lines."""
    added_files, modified_files = tuple(
        get_diff_files(ref_a, ref_b, filter_mode=mode) for mode in ("A", "M")
    )

    warnings_added = filter_and_parse_warnings(warnings, added_files)
    warnings_modified = filter_and_parse_warnings(warnings, modified_files)

    modified_files_warned = {
        file
        for file in modified_files
        if any(str(file) in warning["file"] for warning in warnings_modified)
    }

    warnings_modified_touched = [
        filter_warnings_by_diff(warnings_modified, ref_a, ref_b, file)
        for file in modified_files_warned
    ]
    warnings_touched = warnings_added + list(
        itertools.chain(*warnings_modified_touched)
    )

    return warnings_touched


def annotate_diff(
    warnings: list[str], ref_a: str = "main", ref_b: str = "HEAD"
) -> None:
    """
    Convert Sphinx warning messages to GitHub Actions for changed paragraphs.

    Converts lines like:
        .../Doc/library/cgi.rst:98: WARNING: reference target not found
    to:
        ::warning file=.../Doc/library/cgi.rst,line=98::reference target not found

    See:
    https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-a-warning-message
    """
    warnings_touched = process_touched_warnings(warnings, ref_a, ref_b)
    print("Emitting doc warnings matching modified lines:")
    for warning in warnings_touched:
        print("::warning file={file},line={line}::{msg}".format_map(warning))
        print(warning[0])
    if not warnings_touched:
        print("None")


def fail_if_regression(
    warnings: list[str], files_with_expected_nits: set[str], files_with_nits: set[str]
) -> int:
    """
    Ensure some files always pass Sphinx nit-picky mode (no missing references).
    These are files which are *not* in .nitignore.
    """
    all_rst = {
        str(rst)
        for rst in Path("Doc/").rglob("*.rst")
        if rst.parts[1] not in EXCLUDE_SUBDIRS
    }
    should_be_clean = all_rst - files_with_expected_nits - EXCLUDE_FILES
    problem_files = sorted(should_be_clean & files_with_nits)
    if problem_files:
        print("\nError: must not contain warnings:\n")
        for filename in problem_files:
            print(filename)
            for warning in warnings:
                if filename in warning:
                    if match := WARNING_PATTERN.fullmatch(warning):
                        print("  {line}: {msg}".format_map(match))
        return -1
    return 0


def fail_if_improved(
    files_with_expected_nits: set[str], files_with_nits: set[str]
) -> int:
    """
    We may have fixed warnings in some files so that the files are now completely clean.
    Good news! Let's add them to .nitignore to prevent regression.
    """
    files_with_no_nits = files_with_expected_nits - files_with_nits
    if files_with_no_nits:
        print("\nCongratulations! You improved:\n")
        for filename in sorted(files_with_no_nits):
            print(filename)
        print("\nPlease remove from Doc/tools/.nitignore\n")
        return -1
    return 0


def main(argv: list[str] | None = None) -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--annotate-diff",
        nargs="*",
        metavar=("BASE_REF", "HEAD_REF"),
        help="Add GitHub Actions annotations on the diff for warnings on "
        "lines changed between the given refs (main and HEAD, by default)",
    )
    parser.add_argument(
        "--fail-if-regression",
        action="store_true",
        help="Fail if known-good files have warnings",
    )
    parser.add_argument(
        "--fail-if-improved",
        action="store_true",
        help="Fail if new files with no nits are found",
    )

    args = parser.parse_args(argv)
    if args.annotate_diff is not None and len(args.annotate_diff) > 2:
        parser.error(
            "--annotate-diff takes between 0 and 2 ref args, not "
            f"{len(args.annotate_diff)} {tuple(args.annotate_diff)}"
        )
    exit_code = 0

    wrong_directory_msg = "Must run this script from the repo root"
    assert Path("Doc").exists() and Path("Doc").is_dir(), wrong_directory_msg

    with Path("Doc/sphinx-warnings.txt").open(encoding="UTF-8") as f:
        warnings = f.read().splitlines()

    cwd = str(Path.cwd()) + os.path.sep
    files_with_nits = {
        warning.removeprefix(cwd).split(":")[0]
        for warning in warnings
        if "Doc/" in warning
    }

    with Path("Doc/tools/.nitignore").open(encoding="UTF-8") as clean_files:
        files_with_expected_nits = {
            filename.strip()
            for filename in clean_files
            if filename.strip() and not filename.startswith("#")
        }

    if args.annotate_diff is not None:
        annotate_diff(warnings, *args.annotate_diff)

    if args.fail_if_regression:
        exit_code += fail_if_regression(
            warnings, files_with_expected_nits, files_with_nits
        )

    if args.fail_if_improved:
        exit_code += fail_if_improved(files_with_expected_nits, files_with_nits)

    return exit_code


if __name__ == "__main__":
    sys.exit(main())