mirror of
https://github.com/astral-sh/ruff.git
synced 2025-11-03 05:13:00 +00:00
Use match instead of phf for confusable lookup (#5953)
I don't know whether we want to make this change but here's some data...
Binary size:
- `main`: 30,384
- `charlie/match-phf`: 30,416
llvm-lines:
- `main`: 1,784,148
- `charlie/match-phf`: 1,789,877
llvm-lines and binary size are both unchanged (or, by < 5) when moving
from `u8` to `u32` return types, and even when moving to `char` keys and
values. I didn't expect this, but I'm not very knowledgable on this
topic.
Performance:
```
Confusables/match/src time: [4.9102 µs 4.9352 µs 4.9777 µs]
change: [+1.7469% +2.2421% +2.8710%] (p = 0.00 < 0.05)
Performance has regressed.
Found 12 outliers among 100 measurements (12.00%)
2 (2.00%) low mild
4 (4.00%) high mild
6 (6.00%) high severe
Confusables/match-with-skip/src
time: [2.0676 µs 2.0945 µs 2.1317 µs]
change: [+0.9384% +1.6000% +2.3920%] (p = 0.00 < 0.05)
Change within noise threshold.
Found 8 outliers among 100 measurements (8.00%)
3 (3.00%) high mild
5 (5.00%) high severe
Confusables/phf/src time: [31.087 µs 31.188 µs 31.305 µs]
change: [+1.9262% +2.2188% +2.5496%] (p = 0.00 < 0.05)
Performance has regressed.
Found 15 outliers among 100 measurements (15.00%)
3 (3.00%) low mild
6 (6.00%) high mild
6 (6.00%) high severe
Confusables/phf-with-skip/src
time: [2.0470 µs 2.0486 µs 2.0502 µs]
change: [-0.3093% -0.1446% +0.0106%] (p = 0.08 > 0.05)
No change in performance detected.
Found 4 outliers among 100 measurements (4.00%)
2 (2.00%) high mild
2 (2.00%) high severe
```
The `-with-skip` variants add our optimization which first checks
whether the character is ASCII. So `match` is way, way faster than PHF,
but it tends not to matter since almost all source code is ASCII anyway.
This commit is contained in:
parent
700c816fd5
commit
574c0e0105
5 changed files with 1597 additions and 1611 deletions
15
Cargo.lock
generated
15
Cargo.lock
generated
|
|
@ -1524,7 +1524,6 @@ version = "0.11.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
|
checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"phf_macros",
|
|
||||||
"phf_shared",
|
"phf_shared",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -1548,19 +1547,6 @@ dependencies = [
|
||||||
"rand",
|
"rand",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "phf_macros"
|
|
||||||
version = "0.11.2"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b"
|
|
||||||
dependencies = [
|
|
||||||
"phf_generator",
|
|
||||||
"phf_shared",
|
|
||||||
"proc-macro2",
|
|
||||||
"quote",
|
|
||||||
"syn 2.0.23",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "phf_shared"
|
name = "phf_shared"
|
||||||
version = "0.11.2"
|
version = "0.11.2"
|
||||||
|
|
@ -1916,7 +1902,6 @@ dependencies = [
|
||||||
"path-absolutize",
|
"path-absolutize",
|
||||||
"pathdiff",
|
"pathdiff",
|
||||||
"pep440_rs",
|
"pep440_rs",
|
||||||
"phf",
|
|
||||||
"pretty_assertions",
|
"pretty_assertions",
|
||||||
"pyproject-toml",
|
"pyproject-toml",
|
||||||
"quick-junit",
|
"quick-junit",
|
||||||
|
|
|
||||||
|
|
@ -55,7 +55,6 @@ path-absolutize = { workspace = true, features = [
|
||||||
] }
|
] }
|
||||||
pathdiff = { version = "0.2.1" }
|
pathdiff = { version = "0.2.1" }
|
||||||
pep440_rs = { version = "0.3.1", features = ["serde"] }
|
pep440_rs = { version = "0.3.1", features = ["serde"] }
|
||||||
phf = { version = "0.11", features = ["macros"] }
|
|
||||||
pyproject-toml = { version = "0.6.0" }
|
pyproject-toml = { version = "0.6.0" }
|
||||||
quick-junit = { version = "0.3.2" }
|
quick-junit = { version = "0.3.2" }
|
||||||
regex = { workspace = true }
|
regex = { workspace = true }
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ use ruff_macros::{derive_message_formats, violation};
|
||||||
use ruff_python_ast::source_code::Locator;
|
use ruff_python_ast::source_code::Locator;
|
||||||
|
|
||||||
use crate::registry::AsRule;
|
use crate::registry::AsRule;
|
||||||
use crate::rules::ruff::rules::confusables::CONFUSABLES;
|
use crate::rules::ruff::rules::confusables::confusable;
|
||||||
use crate::rules::ruff::rules::Context;
|
use crate::rules::ruff::rules::Context;
|
||||||
use crate::settings::Settings;
|
use crate::settings::Settings;
|
||||||
|
|
||||||
|
|
@ -193,7 +193,7 @@ pub(crate) fn ambiguous_unicode_character(
|
||||||
// Check if the boundary character is itself an ambiguous unicode character, in which
|
// Check if the boundary character is itself an ambiguous unicode character, in which
|
||||||
// case, it's always included as a diagnostic.
|
// case, it's always included as a diagnostic.
|
||||||
if !current_char.is_ascii() {
|
if !current_char.is_ascii() {
|
||||||
if let Some(representant) = CONFUSABLES.get(&(current_char as u32)).copied() {
|
if let Some(representant) = confusable(current_char as u32) {
|
||||||
let candidate = Candidate::new(
|
let candidate = Candidate::new(
|
||||||
TextSize::try_from(relative_offset).unwrap() + range.start(),
|
TextSize::try_from(relative_offset).unwrap() + range.start(),
|
||||||
current_char,
|
current_char,
|
||||||
|
|
@ -207,7 +207,7 @@ pub(crate) fn ambiguous_unicode_character(
|
||||||
} else if current_char.is_ascii() {
|
} else if current_char.is_ascii() {
|
||||||
// The current word contains at least one ASCII character.
|
// The current word contains at least one ASCII character.
|
||||||
word_flags |= WordFlags::ASCII;
|
word_flags |= WordFlags::ASCII;
|
||||||
} else if let Some(representant) = CONFUSABLES.get(&(current_char as u32)).copied() {
|
} else if let Some(representant) = confusable(current_char as u32) {
|
||||||
// The current word contains an ambiguous unicode character.
|
// The current word contains an ambiguous unicode character.
|
||||||
word_candidates.push(Candidate::new(
|
word_candidates.push(Candidate::new(
|
||||||
TextSize::try_from(relative_offset).unwrap() + range.start(),
|
TextSize::try_from(relative_offset).unwrap() + range.start(),
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load diff
|
|
@ -9,16 +9,16 @@ CONFUSABLES_RS_PATH = "crates/ruff/src/rules/ruff/rules/confusables.rs"
|
||||||
AMBIGUOUS_JSON_URL = "https://raw.githubusercontent.com/hediet/vscode-unicode-data/main/out/ambiguous.json"
|
AMBIGUOUS_JSON_URL = "https://raw.githubusercontent.com/hediet/vscode-unicode-data/main/out/ambiguous.json"
|
||||||
|
|
||||||
prelude = """
|
prelude = """
|
||||||
/// This file is auto-generated by `scripts/update_ambiguous_characters.py`.
|
//! This file is auto-generated by `scripts/update_ambiguous_characters.py`.
|
||||||
use phf::phf_map;
|
|
||||||
|
|
||||||
/// Via: <https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json>
|
/// Via: <https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json>
|
||||||
/// See: <https://github.com/microsoft/vscode/blob/095ddabc52b82498ee7f718a34f9dd11d59099a8/src/vs/base/common/strings.ts#L1094>
|
/// See: <https://github.com/microsoft/vscode/blob/095ddabc52b82498ee7f718a34f9dd11d59099a8/src/vs/base/common/strings.ts#L1094>
|
||||||
#[allow(clippy::unreadable_literal)]
|
pub(crate) fn confusable(c: u32) -> Option<u8> {
|
||||||
pub(crate) static CONFUSABLES: phf::Map<u32, u8> = phf_map! {
|
let result = match c {
|
||||||
|
|
||||||
""".lstrip()
|
""".lstrip()
|
||||||
|
|
||||||
postlude = """};"""
|
postlude = """_ => return None, }; Some(result)}"""
|
||||||
|
|
||||||
|
|
||||||
def get_mapping_data() -> dict:
|
def get_mapping_data() -> dict:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue