Expand SimpleTokenizer to all keywords and single-character tokens (#6518)

## Summary

For #6485, I need to be able to use the `SimpleTokenizer` to lex the space between any two adjacent expressions (i.e., the space between a preceding and a following node). This requires that we support a wider range of keywords (like `and`, to connect the pieces of `x and y`) and some additional single-character tokens (like `-` and `>`, to support `->`). Note that the `SimpleTokenizer` does not support multi-character tokens, so the `->` in a function signature is lexed as a `-` followed by a `>`, but this is fine for our purposes.
2025-09-29 13:24:57 +00:00 · 2023-08-14 10:35:31 -04:00 · 2023-08-14 10:35:31 -04:00 · 3711f8ad59
commit 3711f8ad59
parent a7cf8f0b77
2 changed files with 217 additions and 12 deletions
--- a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tokenize_characters.snap
+++ b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tokenize_characters.snap
@@ -0,0 +1,46 @@
 ---
 source: crates/ruff_python_trivia/src/tokenizer.rs
 expression: test_case.tokens()
 ---
 [
    SimpleToken {
        kind: Minus,
        range: 0..1,
    },
    SimpleToken {
        kind: Greater,
        range: 1..2,
    },
    SimpleToken {
        kind: Whitespace,
        range: 2..3,
    },
    SimpleToken {
        kind: Star,
        range: 3..4,
    },
    SimpleToken {
        kind: Equals,
        range: 4..5,
    },
    SimpleToken {
        kind: Whitespace,
        range: 5..6,
    },
    SimpleToken {
        kind: LParen,
        range: 6..7,
    },
    SimpleToken {
        kind: Tilde,
        range: 7..8,
    },
    SimpleToken {
        kind: Equals,
        range: 8..9,
    },
    SimpleToken {
        kind: RParen,
        range: 9..10,
    },
 ]
--- a/crates/ruff_python_trivia/src/tokenizer.rs
+++ b/crates/ruff_python_trivia/src/tokenizer.rs
@@ -177,29 +177,141 @@ pub enum SimpleTokenKind {
    /// `.`.
    Dot,
-    /// `else`
+    /// `+`
-    Else,
+    Plus,
-    /// `if`
+    /// `-`
-    If,
+    Minus,
-    /// `elif`
+    /// `=`
-    Elif,
+    Equals,
-    /// `in`
+    /// `>`
-    In,
+    Greater,
    /// `<`
    Less,
    /// `%`
    Percent,
    /// `&`
    Ampersand,
    /// `^`
    Circumflex,
    /// `|`
    Vbar,
    /// `@`
    At,
    /// `~`
    Tilde,
    /// `and`
    And,
    /// `as`
    As,
    /// `assert`
    Assert,
    /// `async`
    Async,
    /// `await`
    Await,
    /// `break`
    Break,
    /// `class`
    Class,
    /// `continue`
    Continue,
    /// `def`
    Def,
    /// `del`
    Del,
    /// `elif`
    Elif,
    /// `else`
    Else,
    /// `except`
    Except,
    /// `finally`
    Finally,
    /// `for`
    For,
    /// `from`
    From,
    /// `global`
    Global,
    /// `if`
    If,
    /// `import`
    Import,
    /// `in`
    In,
    /// `is`
    Is,
    /// `lambda`
    Lambda,
    /// `nonlocal`
    Nonlocal,
    /// `not`
    Not,
    /// `or`
    Or,
    /// `pass`
    Pass,
    /// `raise`
    Raise,
    /// `return`
    Return,
    /// `try`
    Try,
    /// `while`
    While,
    /// `match`
    Match,
    /// `type`
    Type,
    /// `case`
    Case,
    /// `with`
    With,
-    /// `async`
+    /// `yield`
-    Async,
+    Yield,
    /// Any other non trivia token.
    Other,
@@ -222,6 +334,17 @@ impl SimpleTokenKind {
            '/' => SimpleTokenKind::Slash,
            '*' => SimpleTokenKind::Star,
            '.' => SimpleTokenKind::Dot,
            '+' => SimpleTokenKind::Plus,
            '-' => SimpleTokenKind::Minus,
            '=' => SimpleTokenKind::Equals,
            '>' => SimpleTokenKind::Greater,
            '<' => SimpleTokenKind::Less,
            '%' => SimpleTokenKind::Percent,
            '&' => SimpleTokenKind::Ampersand,
            '^' => SimpleTokenKind::Circumflex,
            '|' => SimpleTokenKind::Vbar,
            '@' => SimpleTokenKind::At,
            '~' => SimpleTokenKind::Tilde,
            _ => SimpleTokenKind::Other,
        }
    }
@@ -289,15 +412,41 @@ impl<'a> SimpleTokenizer<'a> {
    /// Classifies the source text covered by `range` as a keyword token kind.
    ///
    /// The slice `self.source[range]` is compared, exactly and case-sensitively, against the
    /// keyword spellings listed below. Text that matches none of them is reported as
    /// [`SimpleTokenKind::Other`].
    ///
    /// The soft keywords `match` and `type` are always reported as keywords here; deciding
    /// whether they are really identifiers in a given context is left to the caller.
    fn to_keyword_or_other(&self, range: TextRange) -> SimpleTokenKind {
        let source = &self.source[range];
        match source {
            "and" => SimpleTokenKind::And,
            "as" => SimpleTokenKind::As,
            "assert" => SimpleTokenKind::Assert,
            "async" => SimpleTokenKind::Async,
-            "else" => SimpleTokenKind::Else,
+            "await" => SimpleTokenKind::Await,
            "break" => SimpleTokenKind::Break,
            "class" => SimpleTokenKind::Class,
            "continue" => SimpleTokenKind::Continue,
            "def" => SimpleTokenKind::Def,
            "del" => SimpleTokenKind::Del,
            "elif" => SimpleTokenKind::Elif,
            "else" => SimpleTokenKind::Else,
            "except" => SimpleTokenKind::Except,
            "finally" => SimpleTokenKind::Finally,
            "for" => SimpleTokenKind::For,
            "from" => SimpleTokenKind::From,
            "global" => SimpleTokenKind::Global,
            "if" => SimpleTokenKind::If,
            "import" => SimpleTokenKind::Import,
            "in" => SimpleTokenKind::In,
            "is" => SimpleTokenKind::Is,
            "lambda" => SimpleTokenKind::Lambda,
            "nonlocal" => SimpleTokenKind::Nonlocal,
            "not" => SimpleTokenKind::Not,
            "or" => SimpleTokenKind::Or,
            "pass" => SimpleTokenKind::Pass,
            "raise" => SimpleTokenKind::Raise,
            "return" => SimpleTokenKind::Return,
            "try" => SimpleTokenKind::Try,
            "while" => SimpleTokenKind::While,
            "match" => SimpleTokenKind::Match, // `match` is a soft keyword whose meaning depends on context, but we can always lex it as a keyword and leave it to the caller (parser) to decide whether it should be handled as an identifier or a keyword.
            "type" => SimpleTokenKind::Type, // `type` is a soft keyword whose meaning depends on context, but we can always lex it as a keyword and leave it to the caller (parser) to decide whether it should be handled as an identifier or a keyword.
            "case" => SimpleTokenKind::Case,
            "with" => SimpleTokenKind::With,
-            // ...,
+            "yield" => SimpleTokenKind::Yield,
            _ => SimpleTokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now; see https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
        }
    }
@@ -801,6 +950,16 @@ mod tests {
        test_case.assert_reverse_tokenization();
    }
    // Snapshot test for the single-character operator tokens. The tokenizer has no
    // multi-character tokens, so `->`, `*=`, and `~=` are each expected to come out as two
    // adjacent one-character tokens (e.g. `Minus` followed by `Greater`).
    #[test]
    fn tokenize_characters() {
        // Expected kinds, in order: Minus, Greater, Whitespace, Star, Equals, Whitespace,
        // LParen, Tilde, Equals, RParen.
        let source = "-> *= (~=)";
        let test_case = tokenize(source);
        // NOTE: the stored snapshot records this expression text (`test_case.tokens()`),
        // so keep the binding name in sync with the `.snap` file when editing.
        assert_debug_snapshot!(test_case.tokens());
        // NOTE(review): presumably checks that tokenizing backwards yields the same tokens;
        // confirm against the helper's definition.
        test_case.assert_reverse_tokenization();
    }
    #[test]
    fn tricky_unicode() {
        let source = "មុ";