Add fast-path for comment detection (#9808)

## Summary

When we fall through to parsing, the comment-detection rule accounts for a significant portion of lint time. This PR adds an additional fast heuristic: we abort early if a comment contains two consecutive name tokens (detected via the zero-allocation lexer). For `ctypeslib.py`, which has a few cases that are now caught by this heuristic, this yields a 2.5x speedup for the rule (and a 20% speedup for token-based rules).
2025-10-05 16:10:36 +00:00 · 2024-02-05 08:00:18 -08:00 · 2024-02-05 08:00:18 -08:00 · 9781563ef6
commit 9781563ef6
parent 84aea7f0c8
8 changed files with 157 additions and 8 deletions
--- a/crates/ruff_python_trivia/src/tokenizer.rs
+++ b/crates/ruff_python_trivia/src/tokenizer.rs
@ -182,7 +182,7 @@ fn to_keyword_or_other(source: &str) -> SimpleTokenKind {
        "case" => SimpleTokenKind::Case,
        "with" => SimpleTokenKind::With,
        "yield" => SimpleTokenKind::Yield,
-        _ => SimpleTokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
+        _ => SimpleTokenKind::Name, // Potentially an identifier, but only if it isn't a string prefix. The caller (SimpleTokenizer) is responsible for enforcing that constraint.
    }
 }

@ -467,6 +467,9 @@ pub enum SimpleTokenKind {
    /// `yield`
    Yield,

+    /// An identifier or keyword.
+    Name,
+
    /// Any other non trivia token.
    Other,

@ -566,10 +569,42 @@ impl<'a> SimpleTokenizer<'a> {
                let range = TextRange::at(self.offset, token_len);
                let kind = to_keyword_or_other(&self.source[range]);

-                if kind == SimpleTokenKind::Other {
+                // If the next character is a quote, we may be in a string prefix. For example:
+                // `f"foo`.
+                if kind == SimpleTokenKind::Name
+                    && matches!(self.cursor.first(), '"' | '\'')
+                    && matches!(
+                        &self.source[range],
+                        "B" | "BR"
+                            | "Br"
+                            | "F"
+                            | "FR"
+                            | "Fr"
+                            | "R"
+                            | "RB"
+                            | "RF"
+                            | "Rb"
+                            | "Rf"
+                            | "U"
+                            | "b"
+                            | "bR"
+                            | "br"
+                            | "f"
+                            | "fR"
+                            | "fr"
+                            | "r"
+                            | "rB"
+                            | "rF"
+                            | "rb"
+                            | "rf"
+                            | "u"
+                    )
+                {
                    self.bogus = true;
+                    SimpleTokenKind::Other
+                } else {
+                    kind
                }
-                kind
            }

            // Space, tab, or form feed. We ignore the true semantics of form feed, and treat it as
@ -1153,6 +1188,45 @@ mod tests {
        test_case.assert_reverse_tokenization();
    }

+    #[test]
+    fn string_with_kind() {
+        let source = "f'foo'";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+
+        // note: not reversible: [other, bogus] vs [bogus, other]
+    }
+
+    #[test]
+    fn string_with_byte_kind() {
+        let source = "BR'foo'";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+
+        // note: not reversible: [other, bogus] vs [bogus, other]
+    }
+
+    #[test]
+    fn string_with_invalid_kind() {
+        let source = "abc'foo'";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+
+        // note: not reversible: [other, bogus] vs [bogus, other]
+    }
+
+    #[test]
+    fn identifier_starting_with_string_kind() {
+        let source = "foo bar";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
    #[test]
    fn ignore_word_with_only_id_continuing_chars() {
        let source = "555";