Add fast-path for comment detection (#9808)

## Summary

When we fall through to parsing, the comment-detection rule is a
significant portion of lint time. This PR adds an additional fast
heuristic whereby we abort if a comment contains two consecutive name
tokens (via the zero-allocation lexer). For `ctypeslib.py`, which
has a few cases that are now caught by this, it's a 2.5x speedup for the
rule (and a 20% speedup for token-based rules).
This commit is contained in:
Charlie Marsh 2024-02-05 08:00:18 -08:00 committed by GitHub
parent 84aea7f0c8
commit 9781563ef6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 157 additions and 8 deletions

View file

@ -182,7 +182,7 @@ fn to_keyword_or_other(source: &str) -> SimpleTokenKind {
"case" => SimpleTokenKind::Case,
"with" => SimpleTokenKind::With,
"yield" => SimpleTokenKind::Yield,
_ => SimpleTokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
_ => SimpleTokenKind::Name, // Potentially an identifier, but only if it isn't a string prefix. The caller (SimpleTokenizer) is responsible for enforcing that constraint.
}
}
@ -467,6 +467,9 @@ pub enum SimpleTokenKind {
/// `yield`
Yield,
/// An identifier or keyword.
Name,
/// Any other non trivia token.
Other,
@ -566,10 +569,42 @@ impl<'a> SimpleTokenizer<'a> {
let range = TextRange::at(self.offset, token_len);
let kind = to_keyword_or_other(&self.source[range]);
if kind == SimpleTokenKind::Other {
// If the next character is a quote, we may be in a string prefix. For example:
// `f"foo`.
if kind == SimpleTokenKind::Name
&& matches!(self.cursor.first(), '"' | '\'')
&& matches!(
&self.source[range],
"B" | "BR"
| "Br"
| "F"
| "FR"
| "Fr"
| "R"
| "RB"
| "RF"
| "Rb"
| "Rf"
| "U"
| "b"
| "bR"
| "br"
| "f"
| "fR"
| "fr"
| "r"
| "rB"
| "rF"
| "rb"
| "rf"
| "u"
)
{
self.bogus = true;
SimpleTokenKind::Other
} else {
kind
}
kind
}
// Space, tab, or form feed. We ignore the true semantics of form feed, and treat it as
@ -1153,6 +1188,45 @@ mod tests {
test_case.assert_reverse_tokenization();
}
#[test]
fn string_with_kind() {
    // `f` directly followed by a quote is one of the recognized string prefixes, so
    // the tokenizer reports it as `Other` and marks the remainder as bogus.
    let test_case = tokenize("f'foo'");
    assert_debug_snapshot!(test_case.tokens());
    // Reverse tokenization is deliberately not asserted here: the forward pass
    // yields [other, bogus] whereas the backward pass yields [bogus, other].
}
#[test]
fn string_with_byte_kind() {
    // `BR` is a two-character (raw bytes) string prefix; followed by a quote it must
    // be treated as a string prefix rather than as a name.
    let test_case = tokenize("BR'foo'");
    assert_debug_snapshot!(test_case.tokens());
    // Reverse tokenization is deliberately not asserted here: the forward pass
    // yields [other, bogus] whereas the backward pass yields [bogus, other].
}
#[test]
fn string_with_invalid_kind() {
    // `abc` is not one of the recognized string prefixes, so the prefix check does
    // not rewrite it even though a quote follows immediately.
    let test_case = tokenize("abc'foo'");
    assert_debug_snapshot!(test_case.tokens());
    // Carried over from the original note: not reversible, [other, bogus] vs
    // [bogus, other], hence no reverse-tokenization assertion.
}
#[test]
fn identifier_starting_with_string_kind() {
    // `foo` begins with the prefix character `f`, but no quote follows the word, so
    // the string-prefix check must not fire for it.
    let test_case = tokenize("foo bar");
    assert_debug_snapshot!(test_case.tokens());
    // With no string involved, forward and reverse tokenization are expected to agree.
    test_case.assert_reverse_tokenization();
}
#[test]
fn ignore_word_with_only_id_continuing_chars() {
let source = "555";