"""Generate the confusables.rs file from the VS Code ambiguous.json file.""" import json import subprocess from pathlib import Path CONFUSABLES_RS_PATH = "crates/ruff/src/rules/ruff/rules/confusables.rs" AMBIGUOUS_JSON_URL = "https://raw.githubusercontent.com/hediet/vscode-unicode-data/main/out/ambiguous.json" prelude = """ /// This file is auto-generated by `scripts/update_ambiguous_characters.py`. use phf::phf_map; /// Via: /// See: #[allow(clippy::unreadable_literal)] pub(crate) static CONFUSABLES: phf::Map = phf_map! { """.lstrip() postlude = """};""" def get_mapping_data() -> dict: """ Get the ambiguous character mapping data from the vscode-unicode-data repository. Uses the system's `curl` command to download the data, instead of adding a dependency to a Python-native HTTP client. """ content = subprocess.check_output( ["curl", "-sSL", AMBIGUOUS_JSON_URL], encoding="utf-8", ) # The content is a JSON object literal wrapped in a JSON string, so double decode: return json.loads(json.loads(content)) def format_confusables_rs(raw_data: dict[str, list[int]]) -> str: """Format the downloaded data into a Rust source file.""" # The input data contains duplicate entries flattened_items: set[tuple[int, int]] = set() for _category, items in raw_data.items(): assert len(items) % 2 == 0, "Expected pairs of items" for i in range(0, len(items), 2): flattened_items.add((items[i], items[i + 1])) tuples = [] for left, right in sorted(flattened_items): tuples.append(f" {left}u32 => {right},\n") print(f"{len(tuples)} confusable tuples.") return prelude + "".join(tuples) + postlude def main() -> None: print("Retrieving data...") mapping_data = get_mapping_data() formatted_data = format_confusables_rs(mapping_data) confusables_path = Path(__file__).parent.parent / CONFUSABLES_RS_PATH confusables_path.write_text(formatted_data, encoding="utf-8") print("Formatting Rust file with cargo fmt...") subprocess.check_call(["cargo", "fmt", "--", confusables_path]) print("Done.") if __name__ == "__main__": main()