mirror of
				https://github.com/astral-sh/ruff.git
				synced 2025-10-31 03:55:09 +00:00 
			
		
		
		
	![renovate[bot]](/assets/img/avatar_default.png) 388658efdb
			
		
	
	
		388658efdb
		
			
		
	
	
	
	
		
			
			Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> Co-authored-by: Zanie Blue <contact@zanie.dev> Co-authored-by: Alex Waygood <alex.waygood@gmail.com>
		
			
				
	
	
		
			104 lines
		
	
	
	
		
			3.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			104 lines
		
	
	
	
		
			3.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| """Generate the confusables.rs file from the VS Code ambiguous.json file."""
 | |
| 
 | |
| from __future__ import annotations
 | |
| 
 | |
| import json
 | |
| import subprocess
 | |
| from pathlib import Path
 | |
| 
 | |
| CONFUSABLES_RS_PATH = "crates/ruff_linter/src/rules/ruff/rules/confusables.rs"
 | |
| AMBIGUOUS_JSON_URL = "https://raw.githubusercontent.com/hediet/vscode-unicode-data/main/out/ambiguous.json"
 | |
| 
 | |
| prelude = """
 | |
| //! This file is auto-generated by `scripts/update_ambiguous_characters.py`.
 | |
| 
 | |
| /// Via: <https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json>
 | |
| /// See: <https://github.com/microsoft/vscode/blob/095ddabc52b82498ee7f718a34f9dd11d59099a8/src/vs/base/common/strings.ts#L1094>
 | |
| pub(crate) fn confusable(c: u32) -> Option<char> {
 | |
|   let result = match c {
 | |
| 
 | |
| """.lstrip()
 | |
| 
 | |
| postlude = """_ => return None, }; Some(result)}"""
 | |
| 
 | |
| 
 | |
| def get_mapping_data() -> dict:
 | |
|     """
 | |
|     Get the ambiguous character mapping data from the vscode-unicode-data repository.
 | |
| 
 | |
|     Uses the system's `curl` command to download the data,
 | |
|     instead of adding a dependency to a Python-native HTTP client.
 | |
|     """
 | |
|     content = subprocess.check_output(
 | |
|         ["curl", "-sSL", AMBIGUOUS_JSON_URL],
 | |
|         encoding="utf-8",
 | |
|     )
 | |
|     # The content is a JSON object literal wrapped in a JSON string, so double decode:
 | |
|     return json.loads(json.loads(content))
 | |
| 
 | |
| 
 | |
| def format_number(number: int) -> str:
 | |
|     """Underscore-separate the digits of a number."""
 | |
|     # For unknown historical reasons, numbers greater than 100,000 were
 | |
|     # underscore-delimited in the generated file, so we now preserve that property to
 | |
|     # avoid unnecessary churn.
 | |
|     if number > 100000:
 | |
|         number = str(number)
 | |
|         number = "_".join(number[i : i + 3] for i in range(0, len(number), 3))
 | |
|         return f"{number}_u32"
 | |
| 
 | |
|     return f"{number}u32"
 | |
| 
 | |
| 
 | |
| def format_char(number: int) -> str:
 | |
|     """Format a Python integer as a Rust character literal."""
 | |
|     char = chr(number)
 | |
|     if char == "\\":
 | |
|         return "\\\\"
 | |
|     return char
 | |
| 
 | |
| 
 | |
| def format_confusables_rs(raw_data: dict[str, list[int]]) -> str:
 | |
|     """Format the downloaded data into a Rust source file."""
 | |
|     # The input data contains duplicate entries.
 | |
|     flattened_items: set[tuple[int, int]] = set()
 | |
|     for _category, items in raw_data.items():
 | |
|         assert len(items) % 2 == 0, "Expected pairs of items"
 | |
|         for i in range(0, len(items), 2):
 | |
|             flattened_items.add((items[i], items[i + 1]))
 | |
| 
 | |
|     tuples = [
 | |
|         f"    {format_number(left)} => '{format_char(right)}',\n"
 | |
|         for left, right in sorted(flattened_items)
 | |
|     ]
 | |
| 
 | |
|     # Add some additional confusable pairs that are not included in the VS Code data,
 | |
|     # as they're unicode-to-unicode confusables, not unicode-to-ASCII confusables.
 | |
|     confusable_units = [
 | |
|         # ANGSTROM SIGN → LATIN CAPITAL LETTER A WITH RING ABOVE
 | |
|         ("0x212B", chr(0x00C5)),
 | |
|         # OHM SIGN → GREEK CAPITAL LETTER OMEGA
 | |
|         ("0x2126", chr(0x03A9)),
 | |
|         # MICRO SIGN → GREEK SMALL LETTER MU
 | |
|         ("0x00B5", chr(0x03BC)),
 | |
|     ]
 | |
|     tuples += [f"    {left} => '{right}',\n" for left, right in confusable_units]
 | |
| 
 | |
|     print(f"{len(tuples)} confusable tuples.")
 | |
| 
 | |
|     return prelude + "".join(tuples) + postlude
 | |
| 
 | |
| 
 | |
| def main() -> None:
 | |
|     print("Retrieving data...")
 | |
|     mapping_data = get_mapping_data()
 | |
|     formatted_data = format_confusables_rs(mapping_data)
 | |
|     confusables_path = Path(__file__).parent.parent / CONFUSABLES_RS_PATH
 | |
|     confusables_path.write_text(formatted_data, encoding="utf-8")
 | |
|     print("Formatting Rust file with cargo fmt...")
 | |
|     subprocess.check_call(["cargo", "fmt", "--", confusables_path])
 | |
|     print("Done.")
 | |
| 
 | |
| 
 | |
| if __name__ == "__main__":
 | |
|     main()
 |