Mirror of https://github.com/astral-sh/ruff.git, synced 2025-08-03 10:23:11 +00:00
Use CommentRanges in backwards lexing (#7360)
## Summary

The tokenizer was split into a forward and a backwards tokenizer. The backwards tokenizer uses the same method names as the forward one (e.g. `next_token`). The backwards tokenizer receives the comment ranges that we already built, and uses them to skip comments.

---------

Co-authored-by: Micha Reiser <micha@reiser.io>
This commit is contained in:
parent 1f6e1485f9
commit 2cbe1733c8
41 changed files with 744 additions and 628 deletions
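For orientation, a minimal sketch of how the pieces in this diff fit together, assuming the `ruff_python_trivia` crate layout shown below; the source string and comment offsets are illustrative:

use ruff_python_trivia::{BackwardsTokenizer, SimpleTokenKind};
use ruff_text_size::{TextLen, TextRange, TextSize};

fn main() {
    let source = "x = (  # trailing comment\n";
    // The comment ranges come from an earlier forward pass (e.g. the lexer);
    // here the single comment `# trailing comment` spans offsets 7..25.
    let comment_ranges = [TextRange::new(TextSize::new(7), TextSize::new(25))];

    // Lex backwards from the end of the source; the tokenizer skips the
    // comment via the precomputed ranges instead of rescanning the line.
    let token = BackwardsTokenizer::up_to(source.text_len(), source, &comment_ranges)
        .skip_trivia()
        .next()
        .unwrap();
    assert_eq!(token.kind, SimpleTokenKind::LParen);
}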
@@ -16,8 +16,7 @@ license = { workspace = true }
ruff_text_size = { path = "../ruff_text_size" }
ruff_source_file = { path = "../ruff_source_file" }

memchr = { workspace = true }
smallvec = { workspace = true }
itertools = { workspace = true }
unicode-ident = { workspace = true }

[dev-dependencies]
crates/ruff_python_trivia/src/comment_ranges.rs (new file, +71)
@@ -0,0 +1,71 @@
use std::fmt::{Debug, Formatter};
use std::ops::Deref;

use itertools::Itertools;

use ruff_text_size::{Ranged, TextRange};

/// Stores the ranges of comments sorted by [`TextRange::start`] in increasing order. No two ranges are overlapping.
#[derive(Clone, Default)]
pub struct CommentRanges {
    raw: Vec<TextRange>,
}

impl CommentRanges {
    pub fn new(ranges: Vec<TextRange>) -> Self {
        Self { raw: ranges }
    }

    /// Returns `true` if the given range includes a comment.
    pub fn intersects(&self, target: TextRange) -> bool {
        self.raw
            .binary_search_by(|range| {
                if target.contains_range(*range) {
                    std::cmp::Ordering::Equal
                } else if range.end() < target.start() {
                    std::cmp::Ordering::Less
                } else {
                    std::cmp::Ordering::Greater
                }
            })
            .is_ok()
    }

    /// Returns the comments that are within the range
    pub fn comments_in_range(&self, range: TextRange) -> &[TextRange] {
        let start = self
            .raw
            .partition_point(|comment| comment.start() < range.start());
        // We expect there to be few comments, so switching to a linear `find` should be faster
        match self.raw[start..]
            .iter()
            .find_position(|comment| comment.end() > range.end())
        {
            Some((in_range, _element)) => &self.raw[start..start + in_range],
            None => &self.raw[start..],
        }
    }
}

impl Deref for CommentRanges {
    type Target = [TextRange];

    fn deref(&self) -> &Self::Target {
        self.raw.as_slice()
    }
}

impl Debug for CommentRanges {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.debug_tuple("CommentRanges").field(&self.raw).finish()
    }
}

impl<'a> IntoIterator for &'a CommentRanges {
    type Item = &'a TextRange;
    type IntoIter = std::slice::Iter<'a, TextRange>;

    fn into_iter(self) -> Self::IntoIter {
        self.raw.iter()
    }
}
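A small usage sketch of the queries above; the ranges are arbitrary, and `CommentRanges` is re-exported from `ruff_python_trivia` per the `lib.rs` change below:

use ruff_python_trivia::CommentRanges;
use ruff_text_size::{TextRange, TextSize};

fn main() {
    // Two comments, sorted by start and non-overlapping, as the type requires.
    let comments = CommentRanges::new(vec![
        TextRange::new(TextSize::new(10), TextSize::new(20)),
        TextRange::new(TextSize::new(30), TextSize::new(40)),
    ]);

    // `intersects` reports whether the target range fully contains a comment.
    assert!(comments.intersects(TextRange::new(TextSize::new(5), TextSize::new(25))));
    assert!(!comments.intersects(TextRange::new(TextSize::new(21), TextSize::new(29))));

    // `comments_in_range` returns the comments contained in the range.
    let within = comments.comments_in_range(TextRange::new(TextSize::new(0), TextSize::new(25)));
    assert_eq!(within.len(), 1);
}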
@@ -44,7 +44,7 @@ impl<'a> Cursor<'a> {
        self.chars.clone().next_back().unwrap_or(EOF_CHAR)
    }

    // SAFETY: THe `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
    // SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
    #[allow(clippy::cast_possible_truncation)]
    pub fn text_len(&self) -> TextSize {
        TextSize::new(self.chars.as_str().len() as u32)
@@ -1,8 +1,10 @@
mod comment_ranges;
mod cursor;
pub mod textwrap;
mod tokenizer;
mod whitespace;

pub use comment_ranges::CommentRanges;
pub use cursor::*;
pub use tokenizer::*;
pub use whitespace::*;
@@ -1,4 +1,3 @@
use memchr::{memchr2, memchr3, memrchr3_iter};
use unicode_ident::{is_xid_continue, is_xid_start};

use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
@@ -121,6 +120,47 @@ fn is_identifier_continuation(c: char) -> bool {
    }
}

fn to_keyword_or_other(source: &str) -> SimpleTokenKind {
    match source {
        "and" => SimpleTokenKind::And,
        "as" => SimpleTokenKind::As,
        "assert" => SimpleTokenKind::Assert,
        "async" => SimpleTokenKind::Async,
        "await" => SimpleTokenKind::Await,
        "break" => SimpleTokenKind::Break,
        "class" => SimpleTokenKind::Class,
        "continue" => SimpleTokenKind::Continue,
        "def" => SimpleTokenKind::Def,
        "del" => SimpleTokenKind::Del,
        "elif" => SimpleTokenKind::Elif,
        "else" => SimpleTokenKind::Else,
        "except" => SimpleTokenKind::Except,
        "finally" => SimpleTokenKind::Finally,
        "for" => SimpleTokenKind::For,
        "from" => SimpleTokenKind::From,
        "global" => SimpleTokenKind::Global,
        "if" => SimpleTokenKind::If,
        "import" => SimpleTokenKind::Import,
        "in" => SimpleTokenKind::In,
        "is" => SimpleTokenKind::Is,
        "lambda" => SimpleTokenKind::Lambda,
        "nonlocal" => SimpleTokenKind::Nonlocal,
        "not" => SimpleTokenKind::Not,
        "or" => SimpleTokenKind::Or,
        "pass" => SimpleTokenKind::Pass,
        "raise" => SimpleTokenKind::Raise,
        "return" => SimpleTokenKind::Return,
        "try" => SimpleTokenKind::Try,
        "while" => SimpleTokenKind::While,
        "match" => SimpleTokenKind::Match, // Match is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
        "type" => SimpleTokenKind::Type, // Type is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
        "case" => SimpleTokenKind::Case,
        "with" => SimpleTokenKind::With,
        "yield" => SimpleTokenKind::Yield,
        _ => SimpleTokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
    }
}

#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub struct SimpleToken {
    pub kind: SimpleTokenKind,
@@ -421,17 +461,15 @@ impl SimpleTokenKind {
    }
}

/// Simple zero allocation tokenizer for tokenizing trivia (and some tokens).
/// Simple zero allocation tokenizer handling most tokens.
///
/// The tokenizer must start at an offset that is trivia (e.g. not inside of a multiline string).
///
/// The tokenizer doesn't guarantee any correctness after it returned a [`SimpleTokenKind::Other`]. That's why it
/// will return [`SimpleTokenKind::Bogus`] for every character after until it reaches the end of the file.
/// In case it finds something it can't parse, the tokenizer will return a
/// [`SimpleTokenKind::Other`] and then only a final [`SimpleTokenKind::Bogus`] afterwards.
pub struct SimpleTokenizer<'a> {
    offset: TextSize,
    back_offset: TextSize,
    /// `true` when it is known that the current `back` line has no comment for sure.
    back_line_has_no_comment: bool,
    bogus: bool,
    source: &'a str,
    cursor: Cursor<'a>,
@@ -441,8 +479,6 @@ impl<'a> SimpleTokenizer<'a> {
    pub fn new(source: &'a str, range: TextRange) -> Self {
        Self {
            offset: range.start(),
            back_offset: range.end(),
            back_line_has_no_comment: false,
            bogus: false,
            source,
            cursor: Cursor::new(&source[range]),
@@ -454,64 +490,6 @@ impl<'a> SimpleTokenizer<'a> {
        Self::new(source, range)
    }

    /// Creates a tokenizer that lexes tokens from the start of `source` up to `offset`.
    ///
    /// Consider using [`SimpleTokenizer::up_to_without_back_comment`] if you intend to lex backwards.
    pub fn up_to(offset: TextSize, source: &'a str) -> Self {
        Self::new(source, TextRange::up_to(offset))
    }

    /// Creates a tokenizer that lexes tokens from the start of `source` up to `offset`, and informs
    /// the lexer that the line at `offset` contains no comments. This can significantly speed up backwards lexing
    /// because the lexer doesn't need to scan for comments.
    pub fn up_to_without_back_comment(offset: TextSize, source: &'a str) -> Self {
        let mut tokenizer = Self::up_to(offset, source);
        tokenizer.back_line_has_no_comment = true;
        tokenizer
    }

    fn to_keyword_or_other(&self, range: TextRange) -> SimpleTokenKind {
        let source = &self.source[range];
        match source {
            "and" => SimpleTokenKind::And,
            "as" => SimpleTokenKind::As,
            "assert" => SimpleTokenKind::Assert,
            "async" => SimpleTokenKind::Async,
            "await" => SimpleTokenKind::Await,
            "break" => SimpleTokenKind::Break,
            "class" => SimpleTokenKind::Class,
            "continue" => SimpleTokenKind::Continue,
            "def" => SimpleTokenKind::Def,
            "del" => SimpleTokenKind::Del,
            "elif" => SimpleTokenKind::Elif,
            "else" => SimpleTokenKind::Else,
            "except" => SimpleTokenKind::Except,
            "finally" => SimpleTokenKind::Finally,
            "for" => SimpleTokenKind::For,
            "from" => SimpleTokenKind::From,
            "global" => SimpleTokenKind::Global,
            "if" => SimpleTokenKind::If,
            "import" => SimpleTokenKind::Import,
            "in" => SimpleTokenKind::In,
            "is" => SimpleTokenKind::Is,
            "lambda" => SimpleTokenKind::Lambda,
            "nonlocal" => SimpleTokenKind::Nonlocal,
            "not" => SimpleTokenKind::Not,
            "or" => SimpleTokenKind::Or,
            "pass" => SimpleTokenKind::Pass,
            "raise" => SimpleTokenKind::Raise,
            "return" => SimpleTokenKind::Return,
            "try" => SimpleTokenKind::Try,
            "while" => SimpleTokenKind::While,
            "match" => SimpleTokenKind::Match, // Match is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
            "type" => SimpleTokenKind::Type, // Type is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
            "case" => SimpleTokenKind::Case,
            "with" => SimpleTokenKind::With,
            "yield" => SimpleTokenKind::Yield,
            _ => SimpleTokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
        }
    }

    fn next_token(&mut self) -> SimpleToken {
        self.cursor.start_token();

@@ -523,6 +501,7 @@ impl<'a> SimpleTokenizer<'a> {
        };

        if self.bogus {
            // Emit a single final bogus token
            let token = SimpleToken {
                kind: SimpleTokenKind::Bogus,
                range: TextRange::at(self.offset, first.text_len()),
@@ -532,14 +511,29 @@
            return token;
        }

        let kind = match first {
        let kind = self.next_token_inner(first);

        let token_len = self.cursor.token_len();

        let token = SimpleToken {
            kind,
            range: TextRange::at(self.offset, token_len),
        };

        self.offset += token_len;

        token
    }

    fn next_token_inner(&mut self, first: char) -> SimpleTokenKind {
        match first {
            // Keywords and identifiers
            c if is_identifier_start(c) => {
                self.cursor.eat_while(is_identifier_continuation);
                let token_len = self.cursor.token_len();

                let range = TextRange::at(self.offset, token_len);
                let kind = self.to_keyword_or_other(range);
                let kind = to_keyword_or_other(&self.source[range]);

                if kind == SimpleTokenKind::Other {
                    self.bogus = true;
@@ -717,24 +711,102 @@ impl<'a> SimpleTokenizer<'a> {
                self.bogus = true;
                SimpleTokenKind::Other
            }
        };

        let token_len = self.cursor.token_len();

        let token = SimpleToken {
            kind,
            range: TextRange::at(self.offset, token_len),
        };

        self.offset += token_len;

        token
    }
}

    /// Returns the next token from the back. Prefer iterating forwards. Iterating backwards is significantly more expensive
    /// because it needs to check if the line has any comments when encountering any non-trivia token.
    pub fn next_token_back(&mut self) -> SimpleToken {
    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
        self.filter(|t| !t.kind().is_trivia())
    }
}

impl Iterator for SimpleTokenizer<'_> {
    type Item = SimpleToken;

    fn next(&mut self) -> Option<Self::Item> {
        let token = self.next_token();

        if token.kind == SimpleTokenKind::EndOfFile {
            None
        } else {
            Some(token)
        }
    }
}

/// Simple zero allocation backwards tokenizer for finding preceding tokens.
///
/// The tokenizer must start at an offset that is trivia (e.g. not inside of a multiline string).
/// It will fail when reaching a string.
///
/// In case it finds something it can't parse, the tokenizer will return a
/// [`SimpleTokenKind::Other`] and then only a final [`SimpleTokenKind::Bogus`] afterwards.
pub struct BackwardsTokenizer<'a> {
    offset: TextSize,
    back_offset: TextSize,
    /// Remember if we have checked for comments
    after_newline: bool,
    /// Not `&CommentRanges` to avoid a circular dependency
    comment_ranges: &'a [TextRange],
    /// The index of the previous line-ending comment
    previous_comment_idx: Option<usize>,
    bogus: bool,
    source: &'a str,
    cursor: Cursor<'a>,
}

impl<'a> BackwardsTokenizer<'a> {
    pub fn new(source: &'a str, range: TextRange, comment_range: &'a [TextRange]) -> Self {
        Self {
            offset: range.start(),
            back_offset: range.end(),
            // We could start tokenizing at a comment
            after_newline: true,
            comment_ranges: comment_range,
            previous_comment_idx: None,
            bogus: false,
            source,
            cursor: Cursor::new(&source[range]),
        }
    }

    pub fn up_to(offset: TextSize, source: &'a str, comment_range: &'a [TextRange]) -> Self {
        Self::new(source, TextRange::up_to(offset), comment_range)
    }

    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
        self.filter(|t| !t.kind().is_trivia())
    }

    pub fn next_token(&mut self) -> SimpleToken {
        self.cursor.start_token();
        self.back_offset = self.cursor.text_len() + self.offset;

        if self.after_newline {
            // This comment ended a line with a higher line number, not the current one
            let previous_comment_idx = self.previous_comment_idx.unwrap_or_else(|| {
                self.comment_ranges
                    .partition_point(|comment| comment.end() <= self.back_offset)
            });
            // If `previous_comment_idx == 0`, we're in a comment-free region
            if previous_comment_idx > 0 {
                let comment = self.comment_ranges[previous_comment_idx - 1];
                if comment.end() == self.back_offset {
                    // Skip the comment without iterating over the chars manually
                    self.cursor =
                        Cursor::new(&self.source[TextRange::new(self.offset, comment.start())]);
                    debug_assert_eq!(self.cursor.text_len() + self.offset, comment.start());
                    self.after_newline = false;
                    self.previous_comment_idx = Some(previous_comment_idx - 1);
                    return SimpleToken {
                        kind: SimpleTokenKind::Comment,
                        range: comment.range(),
                    };
                }
                // At least memoize the binary search
                self.previous_comment_idx = Some(previous_comment_idx);
            }
            self.after_newline = false;
        }

        let Some(last) = self.cursor.bump_back() else {
            return SimpleToken {
@@ -762,322 +834,132 @@ impl<'a> SimpleTokenizer<'a> {
            }

            '\r' => {
                self.back_line_has_no_comment = false;
                self.after_newline = true;
                SimpleTokenKind::Newline
            }

            '\n' => {
                self.back_line_has_no_comment = false;
                self.cursor.eat_char_back('\r');
                self.after_newline = true;
                SimpleTokenKind::Newline
            }

            // Empty comment (could also be a comment nested in another comment, but this shouldn't matter for what we use the lexer for)
            '#' => SimpleTokenKind::Comment,

            // For all other tokens, test if the character isn't part of a comment.
            c => {
                // Skip the test whether there's a preceding comment if it has been performed before.
                let comment_length = if self.back_line_has_no_comment {
                    None
                } else {
                    let bytes = self.cursor.chars().as_str().as_bytes();
                    let mut potential_comment_starts: smallvec::SmallVec<[TextSize; 2]> =
                        smallvec::SmallVec::new();

                    // Find the start of the line, or any potential comments.
                    for index in memrchr3_iter(b'\n', b'\r', b'#', bytes) {
                        if bytes[index] == b'#' {
                            // Potentially a comment, but not guaranteed
                            // SAFETY: Safe, because ruff only supports files up to 4GB
                            potential_comment_starts.push(TextSize::try_from(index).unwrap());
                        } else {
                            break;
                        }
                    }

                    // No comments
                    if potential_comment_starts.is_empty() {
                        None
                    } else {
                        // The line contains at least one `#` token. The `#` can indicate the start of a
                        // comment, meaning the current token is commented out, or it is a regular `#` inside of a string.
                        self.comment_from_hash_positions(&potential_comment_starts)
                    }
                };

                // From here on it is guaranteed that this line has no other comment.
                self.back_line_has_no_comment = true;

                if let Some(comment_length) = comment_length {
                    // It is a comment, bump all tokens
                    for _ in 0..usize::from(comment_length) {
                        self.cursor.bump_back().unwrap();
                    }

                    SimpleTokenKind::Comment
                } else {
                    match c {
                        // Keywords and identifiers
                        c if is_identifier_continuation(c) => {
                            // if we only have identifier continuations but no start (e.g. 555) we
                            // don't want to consume the chars, so in that case, we want to rewind the
                            // cursor to here
                            let savepoint = self.cursor.clone();
                            self.cursor.eat_back_while(is_identifier_continuation);

                            let token_len = self.cursor.token_len();
                            let range = TextRange::at(self.back_offset - token_len, token_len);

                            if self.source[range]
                                .chars()
                                .next()
                                .is_some_and(is_identifier_start)
                            {
                                self.to_keyword_or_other(range)
                            } else {
                                self.cursor = savepoint;
                                self.bogus = true;
                                SimpleTokenKind::Other
                            }
                        }

                        // Non-trivia tokens that are unambiguous when lexing backwards.
                        // In other words: these are characters that _don't_ appear at the
                        // end of a multi-character token (like `!=`).
                        '\\' => SimpleTokenKind::Continuation,
                        ':' => SimpleTokenKind::Colon,
                        '~' => SimpleTokenKind::Tilde,
                        '%' => SimpleTokenKind::Percent,
                        '|' => SimpleTokenKind::Vbar,
                        ',' => SimpleTokenKind::Comma,
                        ';' => SimpleTokenKind::Semi,
                        '(' => SimpleTokenKind::LParen,
                        ')' => SimpleTokenKind::RParen,
                        '[' => SimpleTokenKind::LBracket,
                        ']' => SimpleTokenKind::RBracket,
                        '{' => SimpleTokenKind::LBrace,
                        '}' => SimpleTokenKind::RBrace,
                        '&' => SimpleTokenKind::Ampersand,
                        '^' => SimpleTokenKind::Circumflex,
                        '+' => SimpleTokenKind::Plus,
                        '-' => SimpleTokenKind::Minus,

                        // Non-trivia tokens that _are_ ambiguous when lexing backwards.
                        // In other words: these are characters that _might_ mark the end
                        // of a multi-character token (like `!=` or `->` or `//` or `**`).
                        '=' | '*' | '/' | '@' | '!' | '<' | '>' | '.' => {
                            // This could be a single-token token, like `+` in `x + y`, or a
                            // multi-character token, like `+=` in `x += y`. It could also be a sequence
                            // of multi-character tokens, like `x ==== y`, which is invalid, _but_ it's
                            // important that we produce the same token stream when lexing backwards as
                            // we do when lexing forwards. So, identify the range of the sequence, lex
                            // forwards, and return the last token.
                            let mut cursor = self.cursor.clone();
                            cursor.eat_back_while(|c| {
                                matches!(
                                    c,
                                    ':' | '~'
                                        | '%'
                                        | '|'
                                        | '&'
                                        | '^'
                                        | '+'
                                        | '-'
                                        | '='
                                        | '*'
                                        | '/'
                                        | '@'
                                        | '!'
                                        | '<'
                                        | '>'
                                        | '.'
                                )
                            });

                            let token_len = cursor.token_len();
                            let range = TextRange::at(self.back_offset - token_len, token_len);

                            let forward_lexer = Self::new(self.source, range);
                            if let Some(token) = forward_lexer.last() {
                                // If the token spans multiple characters, bump the cursor. Note,
                                // though, that we already bumped the cursor to past the last character
                                // in the token at the very start of `next_token_back`.
                                for _ in self.source[token.range].chars().rev().skip(1) {
                                    self.cursor.bump_back().unwrap();
                                }
                                token.kind()
                            } else {
                                self.bogus = true;
                                SimpleTokenKind::Other
                            }
                        }

                        _ => {
                            self.bogus = true;
                            SimpleTokenKind::Other
                        }
                    }
                }
            }
            _ => self.next_token_inner(last),
        };

        let token_len = self.cursor.token_len();

        let start = self.back_offset - token_len;

        let token = SimpleToken {
        SimpleToken {
            kind,
            range: TextRange::at(start, token_len),
        };

        self.back_offset = start;

        token
    }

    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + DoubleEndedIterator + 'a {
        self.filter(|t| !t.kind().is_trivia())
    }

    /// Given the position of `#` tokens on a line, test if any `#` is the start of a comment and, if so, return the
    /// length of the comment.
    ///
    /// The challenge is that `#` tokens can also appear inside of strings:
    ///
    /// ```python
    /// ' #not a comment'
    /// ```
    ///
    /// This looks innocent, but is the `'` really the start of a new string, or could it be the closing delimiter
    /// of a previously started string:
    ///
    /// ```python
    /// ' a string\
    /// ` # a comment '
    /// ```
    ///
    /// The only way to reliably tell whether the `#` is a comment when the comment contains a quote char is
    /// to forward lex all strings and comments and test if there's any unclosed string literal. If so, then
    /// the hash cannot be a comment.
    fn comment_from_hash_positions(&self, hash_positions: &[TextSize]) -> Option<TextSize> {
        // Iterate over the `#` positions from the start to the end of the line.
        // This is necessary to correctly support `a # comment # comment`.
        for possible_start in hash_positions.iter().rev() {
            let comment_bytes =
                self.source[TextRange::new(*possible_start, self.back_offset)].as_bytes();

            // Test if the comment contains any quotes. If so, then it's possible that the `#` token isn't
            // the start of a comment, but instead part of a string:
            // ```python
            // a + 'a string # not a comment'
            // a + '''a string
            // # not a comment'''
            // ```
            match memchr2(b'\'', b'"', comment_bytes) {
                // Most comments don't contain quotes, and most strings don't contain comments.
                // For these it's safe to assume that they are comments.
                None => return Some(self.cursor.chars().as_str().text_len() - possible_start),
                // Now it gets complicated... There's no good way to know whether this is a string or not.
                // It is necessary to lex all strings and comments from the start to know if it is one or the other.
                Some(_) => {
                    if find_unterminated_string_kind(
                        &self.cursor.chars().as_str()[TextRange::up_to(*possible_start)],
                    )
                    .is_none()
                    {
                        // There's no unterminated string at the comment's start position. This *must*
                        // be a comment.
                        return Some(self.cursor.chars().as_str().text_len() - possible_start);
                    }

                    // This is a hash inside of a string: `'test # not a comment'`; continue with the next potential comment on the line.
                }
            }
        }

        None
    }
}

fn find_unterminated_string_kind(input: &str) -> Option<StringKind> {
    let mut rest = input;

    while let Some(comment_or_string_start) = memchr3(b'#', b'\'', b'\"', rest.as_bytes()) {
        let c = rest.as_bytes()[comment_or_string_start] as char;
        let after = &rest[comment_or_string_start + 1..];

        if c == '#' {
            let comment_end = memchr2(b'\n', b'\r', after.as_bytes()).unwrap_or(after.len());
            rest = &after[comment_end..];
        } else {
            let mut cursor = Cursor::new(after);
            let quote_kind = if c == '\'' {
                QuoteKind::Single
            } else {
                QuoteKind::Double
            };

            let string_kind = if cursor.eat_char(quote_kind.as_char()) {
                // `''` or `""`
                if cursor.eat_char(quote_kind.as_char()) {
                    // `'''` or `"""`
                    StringKind::Triple(quote_kind)
                } else {
                    // empty string literal, nothing more to lex
                    rest = cursor.chars().as_str();
                    continue;
                }
            } else {
                StringKind::Single(quote_kind)
            };

            if !is_string_terminated(string_kind, &mut cursor) {
                return Some(string_kind);
            }

            rest = cursor.chars().as_str();
        }
    }

    None
}
    /// Helper to parse the previous token once we have skipped all whitespace
    fn next_token_inner(&mut self, last: char) -> SimpleTokenKind {
        match last {
            // Keywords and identifiers
            c if is_identifier_continuation(c) => {
                // if we only have identifier continuations but no start (e.g. 555) we
                // don't want to consume the chars, so in that case, we want to rewind the
                // cursor to here
                let savepoint = self.cursor.clone();
                self.cursor.eat_back_while(is_identifier_continuation);

fn is_string_terminated(kind: StringKind, cursor: &mut Cursor) -> bool {
    let quote_char = kind.quote_kind().as_char();
                let token_len = self.cursor.token_len();
                let range = TextRange::at(self.back_offset - token_len, token_len);

    while let Some(c) = cursor.bump() {
        match c {
            '\n' | '\r' if kind.is_single() => {
                // Reached the end of the line without a closing quote, this is an unterminated string literal.
                return false;
            }
            '\\' => {
                // Skip over escaped quotes that match this string's quotes or double-escaped backslashes
                if cursor.eat_char(quote_char) || cursor.eat_char('\\') {
                    continue;
                }
                // Eat a line continuation
                cursor.eat_char('\r');
                cursor.eat_char('\n');
            }
            c if c == quote_char => {
                if kind.is_single() || (cursor.eat_char(quote_char) && cursor.eat_char(quote_char))
                if self.source[range]
                    .chars()
                    .next()
                    .is_some_and(is_identifier_start)
                {
                    return true;
                    to_keyword_or_other(&self.source[range])
                } else {
                    self.cursor = savepoint;
                    self.bogus = true;
                    SimpleTokenKind::Other
                }
            }

            // Non-trivia tokens that are unambiguous when lexing backwards.
            // In other words: these are characters that _don't_ appear at the
            // end of a multi-character token (like `!=`).
            '\\' => SimpleTokenKind::Continuation,
            ':' => SimpleTokenKind::Colon,
            '~' => SimpleTokenKind::Tilde,
            '%' => SimpleTokenKind::Percent,
            '|' => SimpleTokenKind::Vbar,
            ',' => SimpleTokenKind::Comma,
            ';' => SimpleTokenKind::Semi,
            '(' => SimpleTokenKind::LParen,
            ')' => SimpleTokenKind::RParen,
            '[' => SimpleTokenKind::LBracket,
            ']' => SimpleTokenKind::RBracket,
            '{' => SimpleTokenKind::LBrace,
            '}' => SimpleTokenKind::RBrace,
            '&' => SimpleTokenKind::Ampersand,
            '^' => SimpleTokenKind::Circumflex,
            '+' => SimpleTokenKind::Plus,
            '-' => SimpleTokenKind::Minus,

            // Non-trivia tokens that _are_ ambiguous when lexing backwards.
            // In other words: these are characters that _might_ mark the end
            // of a multi-character token (like `!=` or `->` or `//` or `**`).
            '=' | '*' | '/' | '@' | '!' | '<' | '>' | '.' => {
                // This could be a single-token token, like `+` in `x + y`, or a
                // multi-character token, like `+=` in `x += y`. It could also be a sequence
                // of multi-character tokens, like `x ==== y`, which is invalid, _but_ it's
                // important that we produce the same token stream when lexing backwards as
                // we do when lexing forwards. So, identify the range of the sequence, lex
                // forwards, and return the last token.
                let mut cursor = self.cursor.clone();
                cursor.eat_back_while(|c| {
                    matches!(
                        c,
                        ':' | '~'
                            | '%'
                            | '|'
                            | '&'
                            | '^'
                            | '+'
                            | '-'
                            | '='
                            | '*'
                            | '/'
                            | '@'
                            | '!'
                            | '<'
                            | '>'
                            | '.'
                    )
                });

                let token_len = cursor.token_len();
                let range = TextRange::at(self.back_offset - token_len, token_len);

                let forward_lexer = SimpleTokenizer::new(self.source, range);
                if let Some(token) = forward_lexer.last() {
                    // If the token spans multiple characters, bump the cursor. Note,
                    // though, that we already bumped the cursor to past the last character
                    // in the token at the very start of `next_token`.
                    for _ in self.source[token.range].chars().rev().skip(1) {
                        self.cursor.bump_back().unwrap();
                    }
                    token.kind()
                } else {
                    self.bogus = true;
                    SimpleTokenKind::Other
                }
            }
            _ => {
                // continue
                self.bogus = true;
                SimpleTokenKind::Other
            }
        }
    }

    // Reached end without a closing quote
    false
}

impl Iterator for SimpleTokenizer<'_> {
impl Iterator for BackwardsTokenizer<'_> {
    type Item = SimpleToken;

    fn next(&mut self) -> Option<Self::Item> {
@@ -1091,64 +973,16 @@ impl Iterator for SimpleTokenizer<'_> {
    }
}

impl DoubleEndedIterator for SimpleTokenizer<'_> {
    fn next_back(&mut self) -> Option<Self::Item> {
        let token = self.next_token_back();

        if token.kind == SimpleTokenKind::EndOfFile {
            None
        } else {
            Some(token)
        }
    }
}

#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum StringKind {
    /// `'...'` or `"..."`
    Single(QuoteKind),
    /// `'''...'''` or `"""..."""`
    Triple(QuoteKind),
}

impl StringKind {
    const fn quote_kind(self) -> QuoteKind {
        match self {
            StringKind::Single(kind) => kind,
            StringKind::Triple(kind) => kind,
        }
    }

    const fn is_single(self) -> bool {
        matches!(self, StringKind::Single(_))
    }
}

#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum QuoteKind {
    /// `'`
    Single,

    /// `"`
    Double,
}

impl QuoteKind {
    const fn as_char(self) -> char {
        match self {
            QuoteKind::Single => '\'',
            QuoteKind::Double => '"',
        }
    }
}

#[cfg(test)]
mod tests {
    use insta::assert_debug_snapshot;

    use ruff_python_parser::lexer::lex;
    use ruff_python_parser::{Mode, Tok};
    use ruff_text_size::{TextLen, TextRange, TextSize};

    use crate::tokenizer::{lines_after, lines_before, SimpleToken, SimpleTokenizer};
    use crate::{BackwardsTokenizer, SimpleTokenKind};

    struct TokenizationTestCase {
        source: &'static str,
@@ -1167,9 +1001,17 @@ mod tests {
    }

    fn tokenize_reverse(&self) -> Vec<SimpleToken> {
        SimpleTokenizer::new(self.source, self.range)
            .rev()
            .collect()
        let comment_ranges: Vec<_> = lex(self.source, Mode::Module)
            .filter_map(|result| {
                let (token, range) = result.expect("Input to be a valid python program.");
                if matches!(token, Tok::Comment(_)) {
                    Some(range)
                } else {
                    None
                }
            })
            .collect();
        BackwardsTokenizer::new(self.source, self.range, &comment_ranges).collect()
    }

    fn tokens(&self) -> &[SimpleToken] {
@@ -1495,4 +1337,22 @@ mod tests {
            1
        );
    }

    #[test]
    fn test_previous_token_simple() {
        let cases = &["x = (", "x = ( ", "x = (\n"];
        for source in cases {
            let token = BackwardsTokenizer::up_to(source.text_len(), source, &[])
                .skip_trivia()
                .next()
                .unwrap();
            assert_eq!(
                token,
                SimpleToken {
                    kind: SimpleTokenKind::LParen,
                    range: TextRange::new(TextSize::new(4), TextSize::new(5)),
                }
            );
        }
    }
}
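The `tokenize_reverse` test helper above also doubles as the recipe callers are expected to follow: collect the comment ranges with one forward lex, then hand them to the backwards tokenizer. As a standalone sketch (the function name is illustrative, the lexer calls mirror the test code):

use ruff_python_parser::lexer::lex;
use ruff_python_parser::{Mode, Tok};
use ruff_text_size::TextRange;

// Collect the ranges of all `#` comments with a single forward lex so that
// `BackwardsTokenizer` can skip them without rescanning each line.
fn collect_comment_ranges(source: &str) -> Vec<TextRange> {
    lex(source, Mode::Module)
        .filter_map(|result| {
            let (token, range) = result.ok()?;
            matches!(token, Tok::Comment(_)).then_some(range)
        })
        .collect()
}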