Check for backtick-quoted shortcut links in CI (#16114)

## Summary

Follow-up to #16035.

`check_docs_formatted.py` will now report backtick-quoted shortcut links
in rule documentation. It uses a regular expression to find them. Such a
link:

* Starts with `[`, followed by <code>\`</code>, then a "name" sequence
of at least one non-backtick non-newline character, followed by another
<code>\`</code>, then ends with `]`.
* Is not followed by either a `[` or a `(`.
* Is not placed within a code block.

If the name is a known Ruff option name, that link is not considered a
violation.

## Test Plan

Manual.
This commit is contained in:
InSync 2025-02-14 14:37:46 +07:00 committed by GitHub
parent 81e202ed52
commit 1db8392a5a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -4,6 +4,7 @@
from __future__ import annotations
import argparse
import json
import os
import re
import subprocess
@ -16,12 +17,26 @@ if TYPE_CHECKING:
from collections.abc import Sequence
SNIPPED_RE = re.compile(
r"(?P<before>^(?P<indent> *)```(?:\s*(?P<language>\w+))?\n)"
r"(?P<before>^(?P<indent>\x20*)```(?:\s*(?P<language>\w+))?\n)"
r"(?P<code>.*?)"
r"(?P<after>^(?P=indent)```\s*$)",
re.DOTALL | re.MULTILINE,
)
# Long explanation: https://www.rexegg.com/regex-best-trick.html
#
# Short explanation:
# Match both code blocks and shortcut links, then discard the former.
# Whatever matched by the second branch is guaranteed to never be
# part of a code block, as that would already be caught by the first.
BACKTICKED_SHORTCUT_LINK_RE = re.compile(
rf"""(?msx)
(?:{SNIPPED_RE}
| \[`(?P<name>[^`\n]+)`](?![\[(])
)
"""
)
# For some rules, we don't want Ruff to fix the formatting as this would "fix" the
# example.
KNOWN_FORMATTING_VIOLATIONS = [
@ -238,6 +253,28 @@ def format_file(file: Path, error_known: bool, args: argparse.Namespace) -> int:
return 0
def find_backticked_shortcut_links(
path: Path, all_config_names: dict[str, object]
) -> set[str]:
"""Check for links of the form: [`foobar`].
See explanation at #16010.
"""
with path.open() as file:
contents = file.read()
broken_link_names: set[str] = set()
for match in BACKTICKED_SHORTCUT_LINK_RE.finditer(contents):
name = match["name"]
if name is not None and name not in all_config_names:
broken_link_names.add(name)
return broken_link_names
def main(argv: Sequence[str] | None = None) -> int:
"""Check code snippets in docs are formatted by Ruff."""
parser = argparse.ArgumentParser(
@ -291,8 +328,14 @@ def main(argv: Sequence[str] | None = None) -> int:
print("Please remove them and re-run.")
return 1
ruff_config_output = subprocess.check_output(
["ruff", "config", "--output-format", "json"], encoding="utf-8"
)
all_config_names = json.loads(ruff_config_output)
violations = 0
errors = 0
broken_links: dict[str, set[str]] = {}
print("Checking docs formatting...")
for file in [*static_docs, *generated_docs]:
rule_name = file.name.split(".")[0]
@ -307,13 +350,38 @@ def main(argv: Sequence[str] | None = None) -> int:
elif result == 2 and not error_known:
errors += 1
broken_links_in_file = find_backticked_shortcut_links(file, all_config_names)
if broken_links_in_file:
broken_links[file.name] = broken_links_in_file
if violations > 0:
print(f"Formatting violations identified: {violations}")
if errors > 0:
print(f"New code block parse errors identified: {errors}")
if violations > 0 or errors > 0:
if broken_links:
print()
print("Do not use backticked shortcut links: [`foobar`]")
print(
"They work with Mkdocs but cannot be rendered by CommonMark and GFM-compliant implementers."
)
print("Instead, use an explicit label:")
print("```markdown")
print("[`lorem.ipsum`][lorem-ipsum]")
print()
print("[lorem-ipsum]: https://example.com/")
print("```")
print()
print("The following links are found to be broken:")
for filename, link_names in broken_links.items():
print(f"- {filename}:")
print("\n".join(f" - {name}" for name in link_names))
if violations > 0 or errors > 0 or broken_links:
return 1
print("All docs are formatted correctly.")