"""Generate the confusables.rs file from the VS Code ambiguous.json file.""" from __future__ import annotations import json import subprocess from pathlib import Path CONFUSABLES_RS_PATH = "crates/ruff_linter/src/rules/ruff/rules/confusables.rs" AMBIGUOUS_JSON_URL = "https://raw.githubusercontent.com/hediet/vscode-unicode-data/main/out/ambiguous.json" prelude = """ //! This file is auto-generated by `scripts/update_ambiguous_characters.py`. /// Via: /// See: pub(crate) fn confusable(c: u32) -> Option { let result = match c { """.lstrip() postlude = """_ => return None, }; Some(result)}""" def get_mapping_data() -> dict: """ Get the ambiguous character mapping data from the vscode-unicode-data repository. Uses the system's `curl` command to download the data, instead of adding a dependency to a Python-native HTTP client. """ content = subprocess.check_output( ["curl", "-sSL", AMBIGUOUS_JSON_URL], encoding="utf-8", ) # The content is a JSON object literal wrapped in a JSON string, so double decode: return json.loads(json.loads(content)) def format_number(number: int) -> str: """Underscore-separate the digits of a number.""" # For unknown historical reasons, numbers greater than 100,000 were # underscore-delimited in the generated file, so we now preserve that property to # avoid unnecessary churn. if number > 100000: number = str(number) number = "_".join(number[i : i + 3] for i in range(0, len(number), 3)) return f"{number}_u32" return f"{number}u32" def format_char(number: int) -> str: """Format a Python integer as a Rust character literal.""" char = chr(number) if char == "\\": return "\\\\" return char def format_confusables_rs(raw_data: dict[str, list[int]]) -> str: """Format the downloaded data into a Rust source file.""" # The input data contains duplicate entries. flattened_items: set[tuple[int, int]] = set() for _category, items in raw_data.items(): assert len(items) % 2 == 0, "Expected pairs of items" for i in range(0, len(items), 2): flattened_items.add((items[i], items[i + 1])) tuples = [ f" {format_number(left)} => '{format_char(right)}',\n" for left, right in sorted(flattened_items) ] # Add some additional confusable pairs that are not included in the VS Code data, # as they're unicode-to-unicode confusables, not unicode-to-ASCII confusables. confusable_units = [ # ANGSTROM SIGN → LATIN CAPITAL LETTER A WITH RING ABOVE ("0x212B", chr(0x00C5)), # OHM SIGN → GREEK CAPITAL LETTER OMEGA ("0x2126", chr(0x03A9)), # MICRO SIGN → GREEK SMALL LETTER MU ("0x00B5", chr(0x03BC)), ] tuples += [f" {left} => '{right}',\n" for left, right in confusable_units] print(f"{len(tuples)} confusable tuples.") return prelude + "".join(tuples) + postlude def main() -> None: print("Retrieving data...") mapping_data = get_mapping_data() formatted_data = format_confusables_rs(mapping_data) confusables_path = Path(__file__).parent.parent / CONFUSABLES_RS_PATH confusables_path.write_text(formatted_data, encoding="utf-8") print("Formatting Rust file with cargo fmt...") subprocess.check_call(["cargo", "fmt", "--", confusables_path]) print("Done.") if __name__ == "__main__": main()