Add fast-path for comment detection (#9808)
## Summary

When we fall through to parsing, the comment-detection rule accounts for a significant portion of lint time. This PR adds an additional fast heuristic: we bail out early if a comment contains two consecutive name tokens (detected via the zero-allocation lexer), since such a comment can never parse as code. For `ctypeslib.py`, which has a few cases that are now caught by this heuristic, it's a 2.5x speedup for the rule (and a 20% speedup for token-based rules).
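To make the heuristic concrete, here's a minimal standalone sketch of the check; it mirrors the code in the diff below, and only the helper's name is hypothetical. A comment like `# to print` lexes, once the comment text is isolated, as two adjacent `Name` tokens, and no Python statement can contain two bare names in a row:

```rust
use itertools::Itertools;
use ruff_python_trivia::{SimpleTokenKind, SimpleTokenizer};
use ruff_text_size::TextSize;

/// Sketch: `true` if the comment text contains two consecutive identifiers,
/// in which case it can't be commented-out code and parsing can be skipped.
fn has_consecutive_names(text: &str) -> bool {
    SimpleTokenizer::starts_at(TextSize::default(), text)
        .skip_trivia()
        .tuple_windows()
        .any(|(first, second)| {
            first.kind == SimpleTokenKind::Name && second.kind == SimpleTokenKind::Name
        })
}
```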
Parent: 84aea7f0c8
Commit: 9781563ef6
8 changed files with 157 additions and 8 deletions
```diff
@@ -1,14 +1,16 @@
 /// See: [eradicate.py](https://github.com/myint/eradicate/blob/98f199940979c94447a461d50d27862b118b282d/eradicate.py)
 use aho_corasick::AhoCorasick;
+use itertools::Itertools;
 use once_cell::sync::Lazy;
 use regex::{Regex, RegexSet};
 
 use ruff_python_parser::parse_suite;
+use ruff_python_trivia::{SimpleTokenKind, SimpleTokenizer};
+use ruff_text_size::TextSize;
 
 static CODE_INDICATORS: Lazy<AhoCorasick> = Lazy::new(|| {
     AhoCorasick::new([
-        "(", ")", "[", "]", "{", "}", ":", "=", "%", "print", "return", "break", "continue",
-        "import",
+        "(", ")", "[", "]", "{", "}", ":", "=", "%", "return", "break", "continue", "import",
     ])
     .unwrap()
});
```
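For context, `CODE_INDICATORS` is an Aho-Corasick prefilter: a comment that contains none of these substrings is dismissed before any parsing. A minimal sketch of that kind of screen (the helper function is hypothetical; `AhoCorasick::new(...)` and `is_match` are the standard `aho_corasick` 1.x API):

```rust
use aho_corasick::AhoCorasick;

/// Sketch: `false` means the comment can't contain code, so skip parsing.
fn passes_code_indicator_screen(text: &str) -> bool {
    // Built once via `Lazy` in the real module.
    let indicators = AhoCorasick::new([
        "(", ")", "[", "]", "{", "}", ":", "=", "%", "return", "break", "continue", "import",
    ])
    .unwrap();
    indicators.is_match(text)
}
```

Since `"print"` is no longer an indicator, a bare `#print` now fails this screen, which is why the test further down flips `assert!(comment_contains_code("#print", &[]))` to the negated form.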
```diff
@@ -44,6 +46,14 @@ pub(crate) fn comment_contains_code(line: &str, task_tags: &[String]) -> bool {
         return false;
     }
 
+    // Fast path: if the comment contains consecutive identifiers, we know it won't parse.
+    let tokenizer = SimpleTokenizer::starts_at(TextSize::default(), line).skip_trivia();
+    if tokenizer.tuple_windows().any(|(first, second)| {
+        first.kind == SimpleTokenKind::Name && second.kind == SimpleTokenKind::Name
+    }) {
+        return false;
+    }
+
     // Ignore task tag comments (e.g., "# TODO(tom): Refactor").
     if line
         .split(&[' ', ':', '('])
```
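A hedged usage illustration of that fast path, using the hypothetical helper sketched under the summary (inputs shown as comment text, i.e. after the leading `#`):

```rust
assert!(has_consecutive_names("to print"));      // Name Name -> can't be code
assert!(has_consecutive_names("some old code")); // ordinary prose short-circuits
assert!(!has_consecutive_names("print(1)"));     // Name "(" -> still a code candidate
```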
```diff
@@ -123,9 +133,10 @@ mod tests {
 
     #[test]
     fn comment_contains_code_with_print() {
-        assert!(comment_contains_code("#print", &[]));
         assert!(comment_contains_code("#print(1)", &[]));
 
+        assert!(!comment_contains_code("#print", &[]));
+        assert!(!comment_contains_code("#print 1", &[]));
         assert!(!comment_contains_code("#to print", &[]));
     }
```
```diff
@@ -4,7 +4,7 @@ expression: test_case.tokens()
 ---
 [
     SimpleToken {
-        kind: Other,
+        kind: Name,
         range: 0..2,
     },
 ]
```
```diff
@@ -0,0 +1,18 @@
+---
+source: crates/ruff_python_trivia/src/tokenizer.rs
+expression: test_case.tokens()
+---
+[
+    SimpleToken {
+        kind: Name,
+        range: 0..3,
+    },
+    SimpleToken {
+        kind: Whitespace,
+        range: 3..4,
+    },
+    SimpleToken {
+        kind: Name,
+        range: 4..7,
+    },
+]
```
```diff
@@ -0,0 +1,14 @@
+---
+source: crates/ruff_python_trivia/src/tokenizer.rs
+expression: test_case.tokens()
+---
+[
+    SimpleToken {
+        kind: Other,
+        range: 0..2,
+    },
+    SimpleToken {
+        kind: Bogus,
+        range: 2..7,
+    },
+]
```
```diff
@@ -0,0 +1,18 @@
+---
+source: crates/ruff_python_trivia/src/tokenizer.rs
+expression: test_case.tokens()
+---
+[
+    SimpleToken {
+        kind: Name,
+        range: 0..3,
+    },
+    SimpleToken {
+        kind: Other,
+        range: 3..4,
+    },
+    SimpleToken {
+        kind: Bogus,
+        range: 4..8,
+    },
+]
```
```diff
@@ -0,0 +1,14 @@
+---
+source: crates/ruff_python_trivia/src/tokenizer.rs
+expression: test_case.tokens()
+---
+[
+    SimpleToken {
+        kind: Other,
+        range: 0..1,
+    },
+    SimpleToken {
+        kind: Bogus,
+        range: 1..6,
+    },
+]
```
```diff
@@ -4,7 +4,7 @@ expression: test_case.tokens()
 ---
 [
     SimpleToken {
-        kind: Other,
+        kind: Name,
         range: 0..6,
     },
 ]
```
```diff
@@ -182,7 +182,7 @@ fn to_keyword_or_other(source: &str) -> SimpleTokenKind {
         "case" => SimpleTokenKind::Case,
         "with" => SimpleTokenKind::With,
         "yield" => SimpleTokenKind::Yield,
-        _ => SimpleTokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
+        _ => SimpleTokenKind::Name, // Potentially an identifier, but only if it isn't a string prefix. The caller (SimpleTokenizer) is responsible for enforcing that constraint.
     }
 }
```
```diff
@@ -467,6 +467,9 @@ pub enum SimpleTokenKind {
     /// `yield`
     Yield,
 
+    /// An identifier or keyword.
+    Name,
+
     /// Any other non trivia token.
     Other,
```
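Putting the pieces together, the new snapshots above correspond to these inputs (reconstructed from the token ranges and the test sources added at the bottom of this diff):

```rust
// Forward tokenization, per the snapshot tests in this commit:
//   "foo bar"  -> [Name 0..3, Whitespace 3..4, Name 4..7]
//   "BR'foo'"  -> [Other 0..2, Bogus 2..7]            // valid byte-string prefix
//   "abc'foo'" -> [Name 0..3, Other 3..4, Bogus 4..8] // not a valid prefix
//   "f'foo'"   -> [Other 0..1, Bogus 1..6]            // valid f-string prefix
```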
```diff
@@ -566,11 +569,43 @@ impl<'a> SimpleTokenizer<'a> {
                 let range = TextRange::at(self.offset, token_len);
                 let kind = to_keyword_or_other(&self.source[range]);
 
-                if kind == SimpleTokenKind::Other {
+                // If the next character is a quote, we may be in a string prefix. For example:
+                // `f"foo`.
+                if kind == SimpleTokenKind::Name
+                    && matches!(self.cursor.first(), '"' | '\'')
+                    && matches!(
+                        &self.source[range],
+                        "B" | "BR"
+                            | "Br"
+                            | "F"
+                            | "FR"
+                            | "Fr"
+                            | "R"
+                            | "RB"
+                            | "RF"
+                            | "Rb"
+                            | "Rf"
+                            | "U"
+                            | "b"
+                            | "bR"
+                            | "br"
+                            | "f"
+                            | "fR"
+                            | "fr"
+                            | "r"
+                            | "rB"
+                            | "rF"
+                            | "rb"
+                            | "rf"
+                            | "u"
+                    )
+                {
                     self.bogus = true;
-                }
-
-                kind
+                    SimpleTokenKind::Other
+                } else {
+                    kind
+                }
             }
 
             // Space, tab, or form feed. We ignore the true semantics of form feed, and treat it as
             // whitespace.
```
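The guard above factors out cleanly. A standalone sketch of the same test (hypothetical helper name; the prefix spellings are copied verbatim from the diff):

```rust
/// Sketch: a candidate name is a string prefix only when the *next*
/// character is a quote and the name is one of Python's legal prefixes.
fn is_string_prefix(name: &str, next_char: char) -> bool {
    matches!(next_char, '"' | '\'')
        && matches!(
            name,
            "B" | "BR" | "Br" | "F" | "FR" | "Fr" | "R" | "RB" | "RF" | "Rb" | "Rf" | "U"
                | "b" | "bR" | "br" | "f" | "fR" | "fr" | "r" | "rB" | "rF" | "rb" | "rf" | "u"
        )
}
```

When this fires, the tokenizer marks the stream as bogus and downgrades the token to `Other`, so the comment-detection fast path never mistakes an f-string prefix for an ordinary identifier.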
```diff
@@ -1153,6 +1188,45 @@ mod tests {
         test_case.assert_reverse_tokenization();
     }
 
+    #[test]
+    fn string_with_kind() {
+        let source = "f'foo'";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+
+        // note: not reversible: [other, bogus] vs [bogus, other]
+    }
+
+    #[test]
+    fn string_with_byte_kind() {
+        let source = "BR'foo'";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+
+        // note: not reversible: [other, bogus] vs [bogus, other]
+    }
+
+    #[test]
+    fn string_with_invalid_kind() {
+        let source = "abc'foo'";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+
+        // note: not reversible: [other, bogus] vs [bogus, other]
+    }
+
+    #[test]
+    fn identifier_starting_with_string_kind() {
+        let source = "foo bar";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
     #[test]
     fn ignore_word_with_only_id_continuing_chars() {
         let source = "555";
```