Add support for multi-character operator tokens to SimpleTokenizer (#6563)

## Summary

Allows for proper lexing of tokens like `->`.

The main challenge is to ensure that our forward and backwards
representations are the same for cases like `===`. Specifically, we want
that to lex as `==` followed by `=` regardless of whether it's a
forwards or backwards lex. To do so, we identify the range of the
sequential characters (the full span of `===`), lex it forwards, then
return the last token.

## Test Plan

`cargo test`
This commit is contained in:
Charlie Marsh 2023-08-16 09:09:19 -04:00 committed by GitHub
parent e28858bb29
commit 86ccdcc9d9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 538 additions and 149 deletions

View file

@ -951,39 +951,11 @@ fn handle_dict_unpacking_comment<'a>(
// if the remaining tokens from the previous node are exactly `**`,
// re-assign the comment to the one that follows the stars
let mut count = 0u32;
// we start from the preceding node but we skip its token
if let Some(token) = tokens.next() {
// The Keyword case
if token.kind == SimpleTokenKind::Star {
count += 1;
if tokens.any(|token| token.kind == SimpleTokenKind::DoubleStar) {
CommentPlacement::trailing(following, comment)
} else {
// The dict case
debug_assert!(
matches!(
token,
SimpleToken {
kind: SimpleTokenKind::LBrace
| SimpleTokenKind::Comma
| SimpleTokenKind::Colon,
..
}
),
"{token:?}",
);
}
}
for token in tokens {
debug_assert!(token.kind == SimpleTokenKind::Star, "Expected star token");
count += 1;
}
if count == 2 {
return CommentPlacement::trailing(following, comment);
}
CommentPlacement::Default(comment)
}
}
/// Own line comments coming after the node are always dangling comments

View file

@ -30,6 +30,14 @@ impl<'a> Cursor<'a> {
self.chars.clone().next().unwrap_or(EOF_CHAR)
}
/// Returns the second character ahead in the input stream without
/// consuming anything.
/// Returns [`EOF_CHAR`] if fewer than two characters remain.
pub fn second(&self) -> char {
    // `Chars` is just a pair of pointers, so cloning it gives us a cheap
    // lookahead iterator that leaves the real cursor position untouched.
    self.chars.clone().nth(1).unwrap_or(EOF_CHAR)
}
/// Peeks the last character from the input stream without consuming it.
/// Returns [`EOF_CHAR`] if there are no characters left.
pub fn last(&self) -> char {

View file

@ -1,46 +0,0 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: Minus,
range: 0..1,
},
SimpleToken {
kind: Greater,
range: 1..2,
},
SimpleToken {
kind: Whitespace,
range: 2..3,
},
SimpleToken {
kind: Star,
range: 3..4,
},
SimpleToken {
kind: Equals,
range: 4..5,
},
SimpleToken {
kind: Whitespace,
range: 5..6,
},
SimpleToken {
kind: LParen,
range: 6..7,
},
SimpleToken {
kind: Tilde,
range: 7..8,
},
SimpleToken {
kind: Equals,
range: 8..9,
},
SimpleToken {
kind: RParen,
range: 9..10,
},
]

View file

@ -0,0 +1,14 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: EqEqual,
range: 0..2,
},
SimpleToken {
kind: Equals,
range: 2..3,
},
]

View file

@ -0,0 +1,22 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: RArrow,
range: 0..2,
},
SimpleToken {
kind: Whitespace,
range: 2..3,
},
SimpleToken {
kind: Other,
range: 3..4,
},
SimpleToken {
kind: Bogus,
range: 4..5,
},
]

View file

@ -0,0 +1,14 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: NotEqual,
range: 0..2,
},
SimpleToken {
kind: Equals,
range: 2..3,
},
]

View file

@ -0,0 +1,106 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: RArrow,
range: 0..2,
},
SimpleToken {
kind: Whitespace,
range: 2..3,
},
SimpleToken {
kind: StarEqual,
range: 3..5,
},
SimpleToken {
kind: Whitespace,
range: 5..6,
},
SimpleToken {
kind: LParen,
range: 6..7,
},
SimpleToken {
kind: Whitespace,
range: 7..8,
},
SimpleToken {
kind: MinusEqual,
range: 8..10,
},
SimpleToken {
kind: Whitespace,
range: 10..11,
},
SimpleToken {
kind: RParen,
range: 11..12,
},
SimpleToken {
kind: Whitespace,
range: 12..13,
},
SimpleToken {
kind: Tilde,
range: 13..14,
},
SimpleToken {
kind: Whitespace,
range: 14..15,
},
SimpleToken {
kind: DoubleSlash,
range: 15..17,
},
SimpleToken {
kind: Whitespace,
range: 17..18,
},
SimpleToken {
kind: DoubleStar,
range: 18..20,
},
SimpleToken {
kind: Whitespace,
range: 20..21,
},
SimpleToken {
kind: DoubleStarEqual,
range: 21..24,
},
SimpleToken {
kind: Whitespace,
range: 24..25,
},
SimpleToken {
kind: Circumflex,
range: 25..26,
},
SimpleToken {
kind: Whitespace,
range: 26..27,
},
SimpleToken {
kind: CircumflexEqual,
range: 27..29,
},
SimpleToken {
kind: Whitespace,
range: 29..30,
},
SimpleToken {
kind: Vbar,
range: 30..31,
},
SimpleToken {
kind: Whitespace,
range: 31..32,
},
SimpleToken {
kind: VbarEqual,
range: 32..34,
},
]

View file

@ -1,7 +1,8 @@
use memchr::{memchr2, memchr3, memrchr3_iter};
use ruff_text_size::{TextLen, TextRange, TextSize};
use unic_ucd_ident::{is_xid_continue, is_xid_start};
use ruff_text_size::{TextLen, TextRange, TextSize};
use crate::{is_python_whitespace, Cursor};
/// Searches for the first non-trivia character in `range`.
@ -213,6 +214,78 @@ pub enum SimpleTokenKind {
/// `~`
Tilde,
/// `==`
EqEqual,
/// `!=`
NotEqual,
/// `<=`
LessEqual,
/// `>=`
GreaterEqual,
/// `<<`
LeftShift,
/// `>>`
RightShift,
/// `**`
DoubleStar,
/// `**=`
DoubleStarEqual,
/// `+=`
PlusEqual,
/// `-=`
MinusEqual,
/// `*=`
StarEqual,
/// `/=`
SlashEqual,
/// `%=`
PercentEqual,
/// `&=`
AmperEqual,
/// `|=`
VbarEqual,
/// `^=`
CircumflexEqual,
/// `<<=`
LeftShiftEqual,
/// `>>=`
RightShiftEqual,
/// `//`
DoubleSlash,
/// `//=`
DoubleSlashEqual,
/// `:=`
ColonEqual,
/// `...`
Ellipsis,
/// `@=`
AtEqual,
/// `->`
RArrow,
/// `and`
And,
@ -326,35 +399,6 @@ pub enum SimpleTokenKind {
}
impl SimpleTokenKind {
/// Maps a single character to the token kind it denotes on its own.
///
/// Characters without a dedicated single-character kind map to
/// [`SimpleTokenKind::Other`]; callers use that as the signal to mark the
/// tokenizer as bogus.
const fn from_non_trivia_char(c: char) -> SimpleTokenKind {
match c {
'(' => SimpleTokenKind::LParen,
')' => SimpleTokenKind::RParen,
'[' => SimpleTokenKind::LBracket,
']' => SimpleTokenKind::RBracket,
'{' => SimpleTokenKind::LBrace,
'}' => SimpleTokenKind::RBrace,
',' => SimpleTokenKind::Comma,
':' => SimpleTokenKind::Colon,
';' => SimpleTokenKind::Semi,
'/' => SimpleTokenKind::Slash,
'*' => SimpleTokenKind::Star,
'.' => SimpleTokenKind::Dot,
'+' => SimpleTokenKind::Plus,
'-' => SimpleTokenKind::Minus,
'=' => SimpleTokenKind::Equals,
'>' => SimpleTokenKind::Greater,
'<' => SimpleTokenKind::Less,
'%' => SimpleTokenKind::Percent,
'&' => SimpleTokenKind::Ampersand,
'^' => SimpleTokenKind::Circumflex,
'|' => SimpleTokenKind::Vbar,
'@' => SimpleTokenKind::At,
'~' => SimpleTokenKind::Tilde,
_ => SimpleTokenKind::Other,
}
}
const fn is_trivia(self) -> bool {
matches!(
self,
@ -478,6 +522,20 @@ impl<'a> SimpleTokenizer<'a> {
}
let kind = match first {
// Keywords and identifiers
c if is_identifier_start(c) => {
self.cursor.eat_while(is_identifier_continuation);
let token_len = self.cursor.token_len();
let range = TextRange::at(self.offset, token_len);
let kind = self.to_keyword_or_other(range);
if kind == SimpleTokenKind::Other {
self.bogus = true;
}
kind
}
' ' | '\t' => {
self.cursor.eat_while(|c| matches!(c, ' ' | '\t'));
SimpleTokenKind::Whitespace
@ -497,21 +555,156 @@ impl<'a> SimpleTokenizer<'a> {
'\\' => SimpleTokenKind::Continuation,
c => {
let kind = if is_identifier_start(c) {
self.cursor.eat_while(is_identifier_continuation);
let token_len = self.cursor.token_len();
let range = TextRange::at(self.offset, token_len);
self.to_keyword_or_other(range)
// Non-trivia, non-keyword tokens
'=' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::EqEqual
} else {
SimpleTokenKind::from_non_trivia_char(c)
};
if kind == SimpleTokenKind::Other {
self.bogus = true;
SimpleTokenKind::Equals
}
kind
}
'+' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::PlusEqual
} else {
SimpleTokenKind::Plus
}
}
'*' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::StarEqual
} else if self.cursor.eat_char('*') {
if self.cursor.eat_char('=') {
SimpleTokenKind::DoubleStarEqual
} else {
SimpleTokenKind::DoubleStar
}
} else {
SimpleTokenKind::Star
}
}
'/' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::SlashEqual
} else if self.cursor.eat_char('/') {
if self.cursor.eat_char('=') {
SimpleTokenKind::DoubleSlashEqual
} else {
SimpleTokenKind::DoubleSlash
}
} else {
SimpleTokenKind::Slash
}
}
'%' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::PercentEqual
} else {
SimpleTokenKind::Percent
}
}
'|' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::VbarEqual
} else {
SimpleTokenKind::Vbar
}
}
'^' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::CircumflexEqual
} else {
SimpleTokenKind::Circumflex
}
}
'&' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::AmperEqual
} else {
SimpleTokenKind::Ampersand
}
}
'-' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::MinusEqual
} else if self.cursor.eat_char('>') {
SimpleTokenKind::RArrow
} else {
SimpleTokenKind::Minus
}
}
'@' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::AtEqual
} else {
SimpleTokenKind::At
}
}
'!' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::NotEqual
} else {
self.bogus = true;
SimpleTokenKind::Other
}
}
'~' => SimpleTokenKind::Tilde,
':' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::ColonEqual
} else {
SimpleTokenKind::Colon
}
}
';' => SimpleTokenKind::Semi,
'<' => {
if self.cursor.eat_char('<') {
if self.cursor.eat_char('=') {
SimpleTokenKind::LeftShiftEqual
} else {
SimpleTokenKind::LeftShift
}
} else if self.cursor.eat_char('=') {
SimpleTokenKind::LessEqual
} else {
SimpleTokenKind::Less
}
}
'>' => {
if self.cursor.eat_char('>') {
if self.cursor.eat_char('=') {
SimpleTokenKind::RightShiftEqual
} else {
SimpleTokenKind::RightShift
}
} else if self.cursor.eat_char('=') {
SimpleTokenKind::GreaterEqual
} else {
SimpleTokenKind::Greater
}
}
',' => SimpleTokenKind::Comma,
'.' => {
if self.cursor.first() == '.' && self.cursor.second() == '.' {
self.cursor.bump();
self.cursor.bump();
SimpleTokenKind::Ellipsis
} else {
SimpleTokenKind::Dot
}
}
// Bracket tokens
'(' => SimpleTokenKind::LParen,
')' => SimpleTokenKind::RParen,
'[' => SimpleTokenKind::LBracket,
']' => SimpleTokenKind::RBracket,
'{' => SimpleTokenKind::LBrace,
'}' => SimpleTokenKind::RBrace,
_ => {
self.bogus = true;
SimpleTokenKind::Other
}
};
@ -612,10 +805,10 @@ impl<'a> SimpleTokenizer<'a> {
}
SimpleTokenKind::Comment
} else if c == '\\' {
SimpleTokenKind::Continuation
} else {
let kind = if is_identifier_continuation(c) {
match c {
// Keywords and identifiers
c if is_identifier_continuation(c) => {
// if we only have identifier continuations but no start (e.g. 555) we
// don't want to consume the chars, so in that case, we want to rewind the
// cursor to here
@ -633,17 +826,87 @@ impl<'a> SimpleTokenizer<'a> {
self.to_keyword_or_other(range)
} else {
self.cursor = savepoint;
self.bogus = true;
SimpleTokenKind::Other
}
} else {
SimpleTokenKind::from_non_trivia_char(c)
};
if kind == SimpleTokenKind::Other {
self.bogus = true;
}
kind
// Non-trivia tokens that are unambiguous when lexing backwards.
// In other words: these are characters that _don't_ appear at the
// end of a multi-character token (like `!=`).
'\\' => SimpleTokenKind::Continuation,
':' => SimpleTokenKind::Colon,
'~' => SimpleTokenKind::Tilde,
'%' => SimpleTokenKind::Percent,
'|' => SimpleTokenKind::Vbar,
',' => SimpleTokenKind::Comma,
';' => SimpleTokenKind::Semi,
'(' => SimpleTokenKind::LParen,
')' => SimpleTokenKind::RParen,
'[' => SimpleTokenKind::LBracket,
']' => SimpleTokenKind::RBracket,
'{' => SimpleTokenKind::LBrace,
'}' => SimpleTokenKind::RBrace,
'&' => SimpleTokenKind::Ampersand,
'^' => SimpleTokenKind::Circumflex,
'+' => SimpleTokenKind::Plus,
'-' => SimpleTokenKind::Minus,
// Non-trivia tokens that _are_ ambiguous when lexing backwards.
// In other words: these are characters that _might_ mark the end
// of a multi-character token (like `!=` or `->` or `//` or `**`).
'=' | '*' | '/' | '@' | '!' | '<' | '>' | '.' => {
// This could be a single-character token, like `=` in `x = y`, or a
// multi-character token, like `==` in `x == y`. It could also be a sequence
// of multi-character tokens, like `x ==== y`, which is invalid, _but_ it's
// important that we produce the same token stream when lexing backwards as
// we do when lexing forwards. So, identify the range of the sequence, lex
// forwards, and return the last token.
let mut cursor = self.cursor.clone();
cursor.eat_back_while(|c| {
matches!(
c,
':' | '~'
| '%'
| '|'
| '&'
| '^'
| '+'
| '-'
| '='
| '*'
| '/'
| '@'
| '!'
| '<'
| '>'
| '.'
)
});
let token_len = cursor.token_len();
let range = TextRange::at(self.back_offset - token_len, token_len);
let forward_lexer = Self::new(self.source, range);
if let Some(token) = forward_lexer.last() {
// If the token spans multiple characters, bump the cursor. Note,
// though, that we already bumped the cursor to past the last character
// in the token at the very start of `next_token_back`.
for _ in self.source[token.range].chars().rev().skip(1) {
self.cursor.bump_back().unwrap();
}
token.kind()
} else {
self.bogus = true;
SimpleTokenKind::Other
}
}
_ => {
self.bogus = true;
SimpleTokenKind::Other
}
}
}
}
};
@ -871,6 +1134,7 @@ impl QuoteKind {
#[cfg(test)]
mod tests {
use insta::assert_debug_snapshot;
use ruff_text_size::{TextLen, TextRange, TextSize};
use crate::tokenizer::{lines_after, lines_before, SimpleToken, SimpleTokenizer};
@ -946,6 +1210,30 @@ mod tests {
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_eq() {
// Should tokenize as `==`, then `=`, regardless of whether we're lexing forwards or
// backwards.
// `===` is not valid Python, but the split must still be deterministic: the
// backwards lexer finds the full operator run, lexes it forwards, and emits
// the last token, so both directions agree.
let source = "===";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_not_eq() {
// Should tokenize as `!=`, then `=`, regardless of whether we're lexing forwards or
// backwards.
// Exercises the same forward/backward agreement guarantee as `tokenize_eq`,
// but with a run that starts with a distinct multi-character token (`!=`).
let source = "!==";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_continuation() {
let source = "( \\\n )";
@ -957,8 +1245,8 @@ mod tests {
}
#[test]
fn tokenize_characters() {
let source = "-> *= (~=)";
fn tokenize_operators() {
let source = "-> *= ( -= ) ~ // ** **= ^ ^= | |=";
let test_case = tokenize(source);
@ -966,6 +1254,17 @@ mod tests {
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_invalid_operators() {
// `$` never starts a valid token, so the tokenizer flags it and everything
// after it as bogus.
let source = "-> $=";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
// note: not reversible: [other, bogus, bogus] vs [bogus, bogus, other]
// (whichever direction hits the invalid `$` first poisons the rest), so no
// `assert_reverse_tokenization` here.
}
#[test]
fn tricky_unicode() {
let source = "មុ";