mirror of
https://github.com/astral-sh/ruff.git
synced 2025-10-24 01:02:44 +00:00
![renovate[bot]](/assets/img/avatar_default.png)
Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> Co-authored-by: Zanie Blue <contact@zanie.dev> Co-authored-by: Alex Waygood <alex.waygood@gmail.com>
104 lines
3.6 KiB
Python
104 lines
3.6 KiB
Python
"""Generate the confusables.rs file from the VS Code ambiguous.json file."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import subprocess
|
|
from pathlib import Path
|
|
|
|
# Location of the generated Rust file; `main` joins this onto a directory two
# levels above this script.
CONFUSABLES_RS_PATH = "crates/ruff_linter/src/rules/ruff/rules/confusables.rs"

# Upstream data set that `get_mapping_data` downloads.
AMBIGUOUS_JSON_URL = "https://raw.githubusercontent.com/hediet/vscode-unicode-data/main/out/ambiguous.json"

# Rust text emitted before the generated match arms. The layout of the emitted
# Rust is left rough here: `main` runs `cargo fmt` on the written file.
prelude = "\n".join(
    (
        "//! This file is auto-generated by `scripts/update_ambiguous_characters.py`.",
        "",
        "/// Via: <https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json>",
        "/// See: <https://github.com/microsoft/vscode/blob/095ddabc52b82498ee7f718a34f9dd11d59099a8/src/vs/base/common/strings.ts#L1094>",
        "pub(crate) fn confusable(c: u32) -> Option<char> {",
        "let result = match c {",
        "",
        "",
    )
)

# Rust text emitted after the generated match arms: the fallback arm plus the
# closing of the `match` and of the function.
postlude = "_ => return None, }; Some(result)}"
|
|
|
|
|
|
def get_mapping_data() -> dict[str, list[int]]:
    """
    Get the ambiguous character mapping data from the vscode-unicode-data repository.

    Uses the system's `curl` command to download the data,
    instead of adding a dependency to a Python-native HTTP client.

    Returns:
        The decoded JSON object. `format_confusables_rs` consumes it as a
        mapping from a category name to a flat list of integers.

    Raises:
        subprocess.CalledProcessError: If `curl` exits with a non-zero status.
            Because of `--fail`, this includes HTTP error responses.
        json.JSONDecodeError: If the payload is not the expected
            doubly-encoded JSON.
    """
    content = subprocess.check_output(
        # `--fail` makes curl exit non-zero on HTTP errors (e.g. a 404), so a
        # server error page is reported as a download failure rather than
        # being handed to the JSON decoder below.
        ["curl", "--fail", "-sSL", AMBIGUOUS_JSON_URL],
        encoding="utf-8",
    )
    # The content is a JSON object literal wrapped in a JSON string, so double decode:
    return json.loads(json.loads(content))
|
|
|
|
|
|
def format_number(number: int) -> str:
    """Format an integer as a Rust `u32` literal.

    Numbers greater than 100,000 are underscore-delimited in groups of three
    digits and get a `_u32` suffix; smaller numbers are emitted as plain
    digits followed by `u32`.
    """
    # For unknown historical reasons, numbers greater than 100,000 were
    # underscore-delimited in the generated file, so we now preserve that property to
    # avoid unnecessary churn.
    if number > 100000:
        # The `_` format option groups from the least-significant digit, so
        # seven-digit values come out as `1_114_111` rather than `111_411_1`.
        # Six-digit values are formatted exactly as before.
        return f"{number:_}_u32"

    return f"{number}u32"
|
|
|
|
|
|
def format_char(number: int) -> str:
    """Format a Python integer as the body of a Rust character literal.

    The caller supplies the surrounding single quotes. Characters that Rust
    does not accept unescaped inside a character literal (backslash, single
    quote, newline, carriage return and tab) are returned backslash-escaped;
    every other character is returned unchanged.
    """
    escapes = {
        "\\": "\\\\",
        # An unescaped quote would close the literal early and yield `'''`.
        "'": "\\'",
        "\n": "\\n",
        "\r": "\\r",
        "\t": "\\t",
    }
    char = chr(number)
    return escapes.get(char, char)
|
|
|
|
|
|
def format_confusables_rs(raw_data: dict[str, list[int]]) -> str:
    """Render the downloaded mapping data as the text of a Rust source file.

    Each value in `raw_data` is read as a flat list of consecutive
    (source, target) integer pairs. The pairs are de-duplicated, sorted and
    written as match arms, followed by a few hand-maintained arms, all
    wrapped between `prelude` and `postlude`. The number of arms is printed.
    """
    # A set is used because the input data contains duplicate entries.
    unique_pairs: set[tuple[int, int]] = set()
    for codepoints in raw_data.values():
        assert len(codepoints) % 2 == 0, "Expected pairs of items"
        unique_pairs.update(
            (codepoints[idx], codepoints[idx + 1])
            for idx in range(0, len(codepoints), 2)
        )

    match_arms = []
    for source, target in sorted(unique_pairs):
        match_arms.append(f" {format_number(source)} => '{format_char(target)}',\n")

    # Extra pairs that the VS Code data does not include, because they map
    # unicode to unicode rather than unicode to ASCII.
    extra_units = (
        # ANGSTROM SIGN → LATIN CAPITAL LETTER A WITH RING ABOVE
        ("0x212B", chr(0x00C5)),
        # OHM SIGN → GREEK CAPITAL LETTER OMEGA
        ("0x2126", chr(0x03A9)),
        # MICRO SIGN → GREEK SMALL LETTER MU
        ("0x00B5", chr(0x03BC)),
    )
    for pattern, replacement in extra_units:
        match_arms.append(f" {pattern} => '{replacement}',\n")

    print(f"{len(match_arms)} confusable tuples.")

    return "".join([prelude, *match_arms, postlude])
|
|
|
|
|
|
def main() -> None:
    """Download the mapping data, write the generated Rust file, and format it."""
    print("Retrieving data...")
    rust_source = format_confusables_rs(get_mapping_data())

    # CONFUSABLES_RS_PATH is resolved against the directory two levels above
    # this script.
    base_dir = Path(__file__).parent.parent
    target = base_dir / CONFUSABLES_RS_PATH
    target.write_text(rust_source, encoding="utf-8")

    print("Formatting Rust file with cargo fmt...")
    subprocess.check_call(["cargo", "fmt", "--", target])
    print("Done.")
|
|
|
|
|
|
# Regenerate the file only when run as a script, not when imported.
if __name__ == "__main__":
    main()
|