mirror of
https://github.com/astral-sh/ruff.git
synced 2025-08-04 02:39:12 +00:00

I don't know whether we want to make this change, but here's some data...

Binary size:
- `main`: 30,384
- `charlie/match-phf`: 30,416

llvm-lines:
- `main`: 1,784,148
- `charlie/match-phf`: 1,789,877

llvm-lines and binary size are both unchanged (or changed by < 5) when moving from `u8` to `u32` return types, and even when moving to `char` keys and values. I didn't expect this, but I'm not very knowledgeable on this topic.

Performance:

```
Confusables/match/src   time:   [4.9102 µs 4.9352 µs 4.9777 µs]
                        change: [+1.7469% +2.2421% +2.8710%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 12 outliers among 100 measurements (12.00%)
  2 (2.00%) low mild
  4 (4.00%) high mild
  6 (6.00%) high severe
Confusables/match-with-skip/src
                        time:   [2.0676 µs 2.0945 µs 2.1317 µs]
                        change: [+0.9384% +1.6000% +2.3920%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 8 outliers among 100 measurements (8.00%)
  3 (3.00%) high mild
  5 (5.00%) high severe
Confusables/phf/src     time:   [31.087 µs 31.188 µs 31.305 µs]
                        change: [+1.9262% +2.2188% +2.5496%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 15 outliers among 100 measurements (15.00%)
  3 (3.00%) low mild
  6 (6.00%) high mild
  6 (6.00%) high severe
Confusables/phf-with-skip/src
                        time:   [2.0470 µs 2.0486 µs 2.0502 µs]
                        change: [-0.3093% -0.1446% +0.0106%] (p = 0.08 > 0.05)
                        No change in performance detected.
Found 4 outliers among 100 measurements (4.00%)
  2 (2.00%) high mild
  2 (2.00%) high severe
```

The `-with-skip` variants add our optimization which first checks whether the character is ASCII. So `match` is way, way faster than PHF, but it tends not to matter since almost all source code is ASCII anyway.
67 lines
2.3 KiB
Python
67 lines
2.3 KiB
Python
"""Generate the confusables.rs file from the VS Code ambiguous.json file."""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import subprocess
|
|
from pathlib import Path
|
|
|
|
# Location of the generated Rust file. `main` joins this onto the parent of
# the directory that contains this script.
CONFUSABLES_RS_PATH = "crates/ruff/src/rules/ruff/rules/confusables.rs"
# Upstream data set that `get_mapping_data` downloads with `curl`.
AMBIGUOUS_JSON_URL = "https://raw.githubusercontent.com/hediet/vscode-unicode-data/main/out/ambiguous.json"

# Rust source emitted before the generated match arms: the file header and
# the opening of `confusable`. `.lstrip()` removes the newline that follows
# the opening triple quote so the output starts directly with the `//!` line.
prelude = """
//! This file is auto-generated by `scripts/update_ambiguous_characters.py`.

/// Via: <https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json>
/// See: <https://github.com/microsoft/vscode/blob/095ddabc52b82498ee7f718a34f9dd11d59099a8/src/vs/base/common/strings.ts#L1094>
pub(crate) fn confusable(c: u32) -> Option<u8> {
let result = match c {

""".lstrip()

# Rust source emitted after the generated match arms: the fallback arm that
# returns `None`, the end of the `match`, and the end of the function.
postlude = """_ => return None, }; Some(result)}"""
|
|
|
|
|
|
def get_mapping_data() -> dict:
    """Download and decode the ambiguous character mapping data.

    The data is fetched from the vscode-unicode-data repository
    (``AMBIGUOUS_JSON_URL``) using the system's ``curl`` command, instead of
    adding a dependency on a Python-native HTTP client.

    Returns:
        The decoded JSON object.

    Raises:
        subprocess.CalledProcessError: If ``curl`` exits with a non-zero
            status. Because of ``--fail`` this includes HTTP error responses.
        json.JSONDecodeError: If the downloaded content is not the expected
            doubly-encoded JSON.
    """
    content = subprocess.check_output(
        # `--fail` makes curl exit non-zero on an HTTP error response, so a
        # 404/500 error page is reported as a download failure instead of
        # being handed to the JSON decoder below.
        ["curl", "--fail", "-sSL", AMBIGUOUS_JSON_URL],
        encoding="utf-8",
    )
    # The content is a JSON object literal wrapped in a JSON string, so double decode:
    return json.loads(json.loads(content))
|
|
|
|
|
|
def format_confusables_rs(raw_data: dict[str, list[int]]) -> str:
    """Format the downloaded data into a Rust source file.

    Every value of ``raw_data`` is a flat list of integers that is read as
    consecutive ``(left, right)`` pairs. Each unique pair becomes one
    ``{left}u32 => {right},`` match arm; the arms are emitted in sorted order
    between the module-level ``prelude`` and ``postlude`` strings.

    Also prints the number of generated match arms.

    Args:
        raw_data: Mapping from a category key to its flat list of pairs. The
            keys are only used in error messages.

    Returns:
        The complete (not yet formatted) Rust source text.

    Raises:
        ValueError: If any list has an odd number of items and therefore
            cannot be split into pairs.
    """
    # The input data contains duplicate entries, so collect the pairs in a set.
    flattened_items: set[tuple[int, int]] = set()
    for category, items in raw_data.items():
        # Explicit check rather than `assert`, so it still runs under `-O`.
        if len(items) % 2 != 0:
            raise ValueError(
                f"Expected pairs of items, but {category!r} has {len(items)} items"
            )
        # Even-indexed items are the left-hand sides, odd-indexed the right.
        flattened_items.update(zip(items[::2], items[1::2]))

    tuples = [f" {left}u32 => {right},\n" for left, right in sorted(flattened_items)]

    print(f"{len(tuples)} confusable tuples.")

    return prelude + "".join(tuples) + postlude
|
|
|
|
|
|
def main() -> None:
    """Regenerate the confusables Rust file from the upstream data.

    Downloads the mapping data, renders it as Rust source, writes it to
    ``CONFUSABLES_RS_PATH`` (resolved against the parent of this script's
    directory) and finally runs ``cargo fmt`` on the written file.
    """
    print("Retrieving data...")
    rust_source = format_confusables_rs(get_mapping_data())

    # The target path is relative to the parent of this script's directory.
    base_dir = Path(__file__).parent.parent
    confusables_path = base_dir / CONFUSABLES_RS_PATH
    confusables_path.write_text(rust_source, encoding="utf-8")

    print("Formatting Rust file with cargo fmt...")
    fmt_command = ["cargo", "fmt", "--", confusables_path]
    subprocess.check_call(fmt_command)

    print("Done.")


# Only run the generator when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|