mirror of https://github.com/astral-sh/ruff.git
Modify comment_ranges slice in BackwardsTokenizer (#7432)
## Summary

I was curious to understand this issue (https://github.com/astral-sh/ruff/issues/7426) and ended up attempting to address it.

## Test Plan

`cargo test`
parent aae02cf275
commit 8d0a5e01bd

1 changed file with 21 additions and 42 deletions
```diff
@@ -743,12 +743,8 @@ impl Iterator for SimpleTokenizer<'_> {
 pub struct BackwardsTokenizer<'a> {
     offset: TextSize,
     back_offset: TextSize,
-    /// Remember if we have check for comments
-    after_newline: bool,
-    /// Not `&CommentRanges` to avoid a circular dependency
+    /// Not `&CommentRanges` to avoid a circular dependency.
     comment_ranges: &'a [TextRange],
-    /// The index the previously line ending comment
-    previous_comment_idx: Option<usize>,
     bogus: bool,
     source: &'a str,
     cursor: Cursor<'a>,
```
```diff
@@ -759,10 +755,9 @@ impl<'a> BackwardsTokenizer<'a> {
         Self {
             offset: range.start(),
             back_offset: range.end(),
-            // We could start tokenizing at a comment
-            after_newline: true,
-            comment_ranges: comment_range,
-            previous_comment_idx: None,
+            // Throw out any comments that follow the range.
+            comment_ranges: &comment_range
+                [..comment_range.partition_point(|comment| comment.start() <= range.end())],
             bogus: false,
             source,
             cursor: Cursor::new(&source[range]),
```
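The constructor change above relies on `comment_range` being sorted by start offset: every comment that begins after `range.end()` forms a suffix of the slice, so `partition_point` can find the cut with a binary search. A minimal standalone sketch of the idea, using plain `(u32, u32)` pairs as a stand-in for ruff's `TextRange` (the helper name is hypothetical):

```rust
/// Keep only the comments that start at or before `range_end`.
///
/// Assumes `comments` is sorted by start offset; `partition_point`
/// then returns the index of the first comment whose start lies past
/// `range_end`, i.e. the length of the prefix we want to keep.
fn trim_trailing_comments(comments: &[(u32, u32)], range_end: u32) -> &[(u32, u32)] {
    &comments[..comments.partition_point(|&(start, _)| start <= range_end)]
}

fn main() {
    let comments = [(0, 10), (20, 30), (40, 50)];
    // Tokenizing only offsets 0..=25: the comment at 40..50 is unreachable.
    assert_eq!(trim_trailing_comments(&comments, 25), &[(0, 10), (20, 30)][..]);
}
```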
```diff
@@ -781,33 +776,6 @@ impl<'a> BackwardsTokenizer<'a> {
         self.cursor.start_token();
         self.back_offset = self.cursor.text_len() + self.offset;

-        if self.after_newline {
-            // This comment ended a line with a higher line number, not the current one
-            let previous_comment_idx = self.previous_comment_idx.unwrap_or_else(|| {
-                self.comment_ranges
-                    .partition_point(|comment| comment.end() <= self.back_offset)
-            });
-            // If `previous_comment_idx == 0`, we're in a comment free region
-            if previous_comment_idx > 0 {
-                let comment = self.comment_ranges[previous_comment_idx - 1];
-                if comment.end() == self.back_offset {
-                    // Skip the comment without iterating over the chars manually
-                    self.cursor =
-                        Cursor::new(&self.source[TextRange::new(self.offset, comment.start())]);
-                    debug_assert_eq!(self.cursor.text_len() + self.offset, comment.start());
-                    self.after_newline = false;
-                    self.previous_comment_idx = Some(previous_comment_idx - 1);
-                    return SimpleToken {
-                        kind: SimpleTokenKind::Comment,
-                        range: comment.range(),
-                    };
-                }
-                // At least memoize the binary search
-                self.previous_comment_idx = Some(previous_comment_idx);
-            }
-            self.after_newline = false;
-        }
-
         let Some(last) = self.cursor.bump_back() else {
             return SimpleToken {
                 kind: SimpleTokenKind::EndOfFile,
```
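With the range trimmed in the constructor and, as the next hunk shows, comments popped off the tail of the slice as they are reached, the slice boundary itself records how far the backwards scan has progressed through the comments. That is why the memoized binary search above, along with the `after_newline` and `previous_comment_idx` fields that drove it, can be deleted outright.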
```diff
@@ -825,6 +793,22 @@ impl<'a> BackwardsTokenizer<'a> {
             return token;
         }

+        if let Some(comment) = self
+            .comment_ranges
+            .last()
+            .filter(|comment| comment.contains_inclusive(self.back_offset))
+        {
+            self.comment_ranges = &self.comment_ranges[..self.comment_ranges.len() - 1];
+
+            // Skip the comment without iterating over the chars manually.
+            self.cursor = Cursor::new(&self.source[TextRange::new(self.offset, comment.start())]);
+            debug_assert_eq!(self.cursor.text_len() + self.offset, comment.start());
+            return SimpleToken {
+                kind: SimpleTokenKind::Comment,
+                range: comment.range(),
+            };
+        }
+
         let kind = match last {
             // This may not be 100% correct because it will lex-out trailing whitespace from a comment
             // as whitespace rather than being part of the token. This shouldn't matter for what we use the lexer for.
```
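Walking backwards, only the last remaining comment can contain the current `back_offset`, and once it is emitted the slice is shortened so each comment is inspected at most once. A hedged sketch of that pop-from-the-back pattern under the same `(u32, u32)` stand-in (hypothetical helper, not ruff's API; the end-inclusive check mirrors `contains_inclusive`):

```rust
/// If `back_offset` falls inside the last remaining comment, return the
/// comment and shrink the slice so the next call considers its predecessor.
fn pop_enclosing_comment<'a>(
    comments: &mut &'a [(u32, u32)],
    back_offset: u32,
) -> Option<(u32, u32)> {
    // Copy the shared reference out so the shortened slice keeps the
    // original lifetime instead of borrowing through the `&mut`.
    let current: &'a [(u32, u32)] = *comments;
    let &(start, end) = current.last()?;
    if start <= back_offset && back_offset <= end {
        *comments = &current[..current.len() - 1];
        Some((start, end))
    } else {
        None
    }
}

fn main() {
    let mut comments: &[(u32, u32)] = &[(0, 10), (20, 30)];
    // A reverse scan positioned at offset 30 sits at the end of 20..=30.
    assert_eq!(pop_enclosing_comment(&mut comments, 30), Some((20, 30)));
    assert_eq!(comments, &[(0, 10)][..]);
    // Offsets outside any comment leave the slice untouched.
    assert_eq!(pop_enclosing_comment(&mut comments, 15), None);
}
```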
```diff
@@ -833,14 +817,9 @@ impl<'a> BackwardsTokenizer<'a> {
                 SimpleTokenKind::Whitespace
             }

-            '\r' => {
-                self.after_newline = true;
-                SimpleTokenKind::Newline
-            }
-
+            '\r' => SimpleTokenKind::Newline,
             '\n' => {
                 self.cursor.eat_char_back('\r');
-                self.after_newline = true;
                 SimpleTokenKind::Newline
             }
             _ => self.next_token_inner(last),
```