Simple lexer for formatter (#4922)

Micha Reiser 2023-06-08 17:37:39 +02:00 committed by GitHub
parent 467df23e65
commit 9c3fb23ace
11 changed files with 1152 additions and 189 deletions


@@ -1,11 +1,11 @@
use crate::comments::visitor::{CommentPlacement, DecoratedComment};
use crate::comments::CommentTextPosition;
use crate::trivia::find_first_non_trivia_character_in_range;
use crate::trivia::{SimpleTokenizer, TokenKind};
use ruff_newlines::StrExt;
use ruff_python_ast::node::AnyNodeRef;
use ruff_python_ast::source_code::Locator;
use ruff_python_ast::whitespace;
use ruff_text_size::{TextLen, TextRange, TextSize};
use ruff_text_size::{TextRange, TextSize};
use rustpython_parser::ast::Ranged;
use std::cmp::Ordering;
@@ -521,14 +521,16 @@ fn handle_trailing_end_of_line_condition_comment<'a>(
// If the preceding is the node before the `colon`
// `while true:` The node before the `colon` is the `true` constant.
if preceding.ptr_eq(last_before_colon) {
let mut start = preceding.end();
while let Some((offset, c)) = find_first_non_trivia_character_in_range(
TextRange::new(start, following.start()),
let tokens = SimpleTokenizer::new(
locator.contents(),
) {
match c {
':' => {
if comment.slice().start() > offset {
TextRange::new(preceding.end(), following.start()),
)
.skip_trivia();
for token in tokens {
match token.kind() {
TokenKind::Colon => {
if comment.slice().start() > token.start() {
// Comment comes after the colon
// ```python
// while a: # comment
@@ -546,9 +548,8 @@ fn handle_trailing_end_of_line_condition_comment<'a>(
// ```
break;
}
')' => {
TokenKind::RParen => {
// Skip over any closing parentheses
start = offset + ')'.text_len();
}
_ => {
unreachable!("Only ')' or ':' should follow the condition")
@@ -652,21 +653,17 @@ fn handle_trailing_binary_expression_left_or_operator_comment<'a>(
return CommentPlacement::Default(comment);
}
let mut between_operands_range = TextRange::new(
let between_operands_range = TextRange::new(
binary_expression.left.end(),
binary_expression.right.start(),
);
let operator_offset = loop {
match find_first_non_trivia_character_in_range(between_operands_range, locator.contents()) {
// Skip over closing parens
Some((offset, ')')) => {
between_operands_range =
TextRange::new(offset + TextSize::new(1), between_operands_range.end());
}
Some((offset, _)) => break offset,
None => return CommentPlacement::Default(comment),
}
let mut tokens = SimpleTokenizer::new(locator.contents(), between_operands_range).skip_trivia();
let operator_offset = if let Some(non_r_paren) = tokens.find(|t| t.kind() != TokenKind::RParen)
{
non_r_paren.start()
} else {
return CommentPlacement::Default(comment);
};
let comment_range = comment.slice().range();
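A sketch of the `find` call above (hypothetical input): closing parentheses after the left operand are skipped, and the first other token marks the operator offset.

let source = ") or";
let mut tokens =
    SimpleTokenizer::new(source, TextRange::up_to(source.text_len())).skip_trivia();
let operator = tokens.find(|t| t.kind() != TokenKind::RParen);
// `o` has no dedicated token kind, so it lexes as `TokenKind::Other` starting at offset 2.
assert_eq!(operator.map(|t| t.start()), Some(TextSize::new(2)));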
@@ -805,29 +802,22 @@ fn find_pos_only_slash_offset(
between_arguments_range: TextRange,
locator: &Locator,
) -> Option<TextSize> {
// First find the comma separating the two arguments
find_first_non_trivia_character_in_range(between_arguments_range, locator.contents()).and_then(
|(comma_offset, comma)| {
debug_assert_eq!(comma, ',');
let mut tokens =
SimpleTokenizer::new(locator.contents(), between_arguments_range).skip_trivia();
// Then find the position of the `/` operator
find_first_non_trivia_character_in_range(
TextRange::new(
comma_offset + TextSize::new(1),
between_arguments_range.end(),
),
locator.contents(),
)
.and_then(|(offset, c)| {
if c == '/' {
Some(offset)
} else {
debug_assert_eq!(c, ')');
None
}
})
},
)
if let Some(comma) = tokens.next() {
debug_assert_eq!(comma.kind(), TokenKind::Comma);
if let Some(maybe_slash) = tokens.next() {
if maybe_slash.kind() == TokenKind::Slash {
return Some(maybe_slash.start());
}
debug_assert_eq!(maybe_slash.kind(), TokenKind::RParen);
}
}
None
}
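A sketch of the token pair the rewritten function relies on (hypothetical input): between the last positional-only argument and the `/` marker, the first two non-trivia tokens are the comma and the slash.

let source = ", /";
let mut tokens =
    SimpleTokenizer::new(source, TextRange::up_to(source.text_len())).skip_trivia();
assert_eq!(tokens.next().map(|t| t.kind()), Some(TokenKind::Comma));
assert_eq!(tokens.next().map(|t| t.kind()), Some(TokenKind::Slash));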
/// Returns `true` if `right` is `Some` and `left` and `right` are referentially equal.


@@ -1,7 +1,6 @@
use crate::trivia::{
find_first_non_trivia_character_after, find_first_non_trivia_character_before,
};
use crate::trivia::{first_non_trivia_token, first_non_trivia_token_rev, Token, TokenKind};
use ruff_python_ast::node::AnyNodeRef;
use rustpython_parser::ast::Ranged;
pub(crate) trait NeedsParentheses {
fn needs_parentheses(&self, parenthesize: Parenthesize, source: &str) -> Parentheses;
@@ -73,21 +72,17 @@ pub enum Parentheses {
}
fn is_expression_parenthesized(expr: AnyNodeRef, contents: &str) -> bool {
use rustpython_parser::ast::Ranged;
debug_assert!(
expr.is_expression(),
"Should only be called for expressions"
);
// Search backwards to avoid ambiguity with `(a, )` and because it's faster
matches!(
find_first_non_trivia_character_after(expr.end(), contents),
Some((_, ')'))
)
// Search forwards to confirm that this is not a nested expression `(5 + d * 3)`
&& matches!(
find_first_non_trivia_character_before(expr.start(), contents),
Some((_, '('))
first_non_trivia_token(expr.end(), contents),
Some(Token {
kind: TokenKind::RParen,
..
})
) && matches!(
first_non_trivia_token_rev(expr.start(), contents),
Some(Token {
kind: TokenKind::LParen,
..
})
)
}
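A minimal sketch of the two lookups (hypothetical input): for `(a)` with the expression `a` spanning `1..2`, the forward lookup from the end offset finds the `)` and the backward lookup from the start offset finds the `(`.

let contents = "(a)";
assert!(matches!(
    first_non_trivia_token(TextSize::new(2), contents),
    Some(Token { kind: TokenKind::RParen, .. })
));
assert!(matches!(
    first_non_trivia_token_rev(TextSize::new(1), contents),
    Some(Token { kind: TokenKind::LParen, .. })
));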


@@ -0,0 +1,218 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: test_case.tokenize_reverse()
---
[
Token {
kind: RParen,
range: 52..53,
},
Token {
kind: Other,
range: 51..52,
},
Token {
kind: Bogus,
range: 50..51,
},
Token {
kind: Bogus,
range: 49..50,
},
Token {
kind: Bogus,
range: 48..49,
},
Token {
kind: Bogus,
range: 47..48,
},
Token {
kind: Bogus,
range: 46..47,
},
Token {
kind: Bogus,
range: 45..46,
},
Token {
kind: Bogus,
range: 44..45,
},
Token {
kind: Bogus,
range: 43..44,
},
Token {
kind: Bogus,
range: 42..43,
},
Token {
kind: Bogus,
range: 41..42,
},
Token {
kind: Bogus,
range: 40..41,
},
Token {
kind: Bogus,
range: 39..40,
},
Token {
kind: Bogus,
range: 38..39,
},
Token {
kind: Bogus,
range: 37..38,
},
Token {
kind: Bogus,
range: 36..37,
},
Token {
kind: Bogus,
range: 35..36,
},
Token {
kind: Bogus,
range: 34..35,
},
Token {
kind: Bogus,
range: 33..34,
},
Token {
kind: Bogus,
range: 32..33,
},
Token {
kind: Bogus,
range: 31..32,
},
Token {
kind: Bogus,
range: 30..31,
},
Token {
kind: Bogus,
range: 29..30,
},
Token {
kind: Bogus,
range: 28..29,
},
Token {
kind: Bogus,
range: 27..28,
},
Token {
kind: Bogus,
range: 26..27,
},
Token {
kind: Bogus,
range: 25..26,
},
Token {
kind: Bogus,
range: 24..25,
},
Token {
kind: Bogus,
range: 23..24,
},
Token {
kind: Bogus,
range: 22..23,
},
Token {
kind: Bogus,
range: 21..22,
},
Token {
kind: Bogus,
range: 20..21,
},
Token {
kind: Bogus,
range: 19..20,
},
Token {
kind: Bogus,
range: 18..19,
},
Token {
kind: Bogus,
range: 17..18,
},
Token {
kind: Bogus,
range: 16..17,
},
Token {
kind: Bogus,
range: 15..16,
},
Token {
kind: Bogus,
range: 14..15,
},
Token {
kind: Bogus,
range: 13..14,
},
Token {
kind: Bogus,
range: 12..13,
},
Token {
kind: Bogus,
range: 11..12,
},
Token {
kind: Bogus,
range: 10..11,
},
Token {
kind: Bogus,
range: 9..10,
},
Token {
kind: Bogus,
range: 8..9,
},
Token {
kind: Bogus,
range: 7..8,
},
Token {
kind: Bogus,
range: 6..7,
},
Token {
kind: Bogus,
range: 5..6,
},
Token {
kind: Bogus,
range: 4..5,
},
Token {
kind: Bogus,
range: 3..4,
},
Token {
kind: Bogus,
range: 2..3,
},
Token {
kind: Bogus,
range: 1..2,
},
Token {
kind: Bogus,
range: 0..1,
},
]


@@ -0,0 +1,126 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: test_case.tokens()
---
[
Token {
kind: Comment,
range: 0..17,
},
Token {
kind: Newline,
range: 17..18,
},
Token {
kind: Whitespace,
range: 18..26,
},
Token {
kind: Other,
range: 26..27,
},
Token {
kind: Bogus,
range: 27..28,
},
Token {
kind: Bogus,
range: 28..29,
},
Token {
kind: Bogus,
range: 29..30,
},
Token {
kind: Bogus,
range: 30..31,
},
Token {
kind: Bogus,
range: 31..32,
},
Token {
kind: Bogus,
range: 32..33,
},
Token {
kind: Bogus,
range: 33..34,
},
Token {
kind: Bogus,
range: 34..35,
},
Token {
kind: Bogus,
range: 35..36,
},
Token {
kind: Bogus,
range: 36..37,
},
Token {
kind: Bogus,
range: 37..38,
},
Token {
kind: Bogus,
range: 38..39,
},
Token {
kind: Bogus,
range: 39..40,
},
Token {
kind: Bogus,
range: 40..41,
},
Token {
kind: Bogus,
range: 41..42,
},
Token {
kind: Bogus,
range: 42..43,
},
Token {
kind: Bogus,
range: 43..44,
},
Token {
kind: Bogus,
range: 44..45,
},
Token {
kind: Bogus,
range: 45..46,
},
Token {
kind: Bogus,
range: 46..47,
},
Token {
kind: Bogus,
range: 47..48,
},
Token {
kind: Bogus,
range: 48..49,
},
Token {
kind: Bogus,
range: 49..50,
},
Token {
kind: Bogus,
range: 50..51,
},
Token {
kind: Bogus,
range: 51..52,
},
Token {
kind: Bogus,
range: 52..53,
},
]


@@ -0,0 +1,22 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: tokens
---
[
Token {
kind: Comma,
range: 0..1,
},
Token {
kind: Comma,
range: 1..2,
},
Token {
kind: Comma,
range: 2..3,
},
Token {
kind: Comma,
range: 3..4,
},
]


@@ -0,0 +1,30 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: tokens
---
[
Token {
kind: LParen,
range: 0..1,
},
Token {
kind: Whitespace,
range: 1..2,
},
Token {
kind: Continuation,
range: 2..3,
},
Token {
kind: Newline,
range: 3..4,
},
Token {
kind: Whitespace,
range: 4..5,
},
Token {
kind: RParen,
range: 5..6,
},
]


@@ -0,0 +1,30 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: tokens
---
[
Token {
kind: LParen,
range: 0..1,
},
Token {
kind: LBracket,
range: 1..2,
},
Token {
kind: LBrace,
range: 2..3,
},
Token {
kind: RBrace,
range: 3..4,
},
Token {
kind: RBracket,
range: 4..5,
},
Token {
kind: RParen,
range: 5..6,
},
]


@@ -0,0 +1,42 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: test_case.tokens()
---
[
Token {
kind: Whitespace,
range: 0..1,
},
Token {
kind: Comment,
range: 1..30,
},
Token {
kind: Newline,
range: 30..31,
},
Token {
kind: Whitespace,
range: 31..39,
},
Token {
kind: Comment,
range: 39..77,
},
Token {
kind: Newline,
range: 77..78,
},
Token {
kind: Whitespace,
range: 78..86,
},
Token {
kind: Comma,
range: 86..87,
},
Token {
kind: Slash,
range: 87..88,
},
]


@@ -0,0 +1,18 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: tokens
---
[
Token {
kind: RParen,
range: 14..15,
},
Token {
kind: Whitespace,
range: 15..16,
},
Token {
kind: Comment,
range: 16..25,
},
]


@@ -0,0 +1,22 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: tokens
---
[
Token {
kind: Comment,
range: 0..9,
},
Token {
kind: Newline,
range: 9..10,
},
Token {
kind: Whitespace,
range: 10..14,
},
Token {
kind: Comment,
range: 14..23,
},
]


@@ -1,5 +1,6 @@
use ruff_python_ast::whitespace::is_python_whitespace;
use ruff_text_size::{TextLen, TextRange, TextSize};
use std::str::Chars;
/// Searches for the first non-trivia character in `range`.
///
@@ -9,113 +10,40 @@ use ruff_text_size::{TextLen, TextRange, TextSize};
/// of the character, the second item the non-trivia character.
///
/// Returns `None` if the range is empty or only contains trivia (whitespace or comments).
pub(crate) fn find_first_non_trivia_character_in_range(
range: TextRange,
code: &str,
) -> Option<(TextSize, char)> {
let rest = &code[range];
let mut char_iter = rest.chars();
while let Some(c) = char_iter.next() {
match c {
'#' => {
// We're now inside of a comment. Skip all content until the end of the line
for c in char_iter.by_ref() {
if matches!(c, '\n' | '\r') {
break;
}
}
}
c => {
if !is_python_whitespace(c) {
let index = range.start() + rest.text_len()
- char_iter.as_str().text_len()
- c.text_len();
return Some((index, c));
}
}
}
}
None
pub(crate) fn first_non_trivia_token(offset: TextSize, code: &str) -> Option<Token> {
SimpleTokenizer::starts_at(offset, code)
.skip_trivia()
.next()
}
pub(crate) fn find_first_non_trivia_character_after(
offset: TextSize,
code: &str,
) -> Option<(TextSize, char)> {
find_first_non_trivia_character_in_range(TextRange::new(offset, code.text_len()), code)
}
pub(crate) fn find_first_non_trivia_character_before(
offset: TextSize,
code: &str,
) -> Option<(TextSize, char)> {
let head = &code[TextRange::up_to(offset)];
let mut char_iter = head.chars();
while let Some(c) = char_iter.next_back() {
match c {
c if is_python_whitespace(c) => {
continue;
}
// Empty comment
'#' => continue,
non_trivia_character => {
// Non trivia character but we don't know if it is a comment or not. Consume all characters
// until the start of the line and track if the last non-whitespace character was a `#`.
let mut is_comment = false;
let first_non_trivia_offset = char_iter.as_str().text_len();
while let Some(c) = char_iter.next_back() {
match c {
'#' => {
is_comment = true;
}
'\n' | '\r' => {
if !is_comment {
return Some((first_non_trivia_offset, non_trivia_character));
}
}
c => {
if !is_python_whitespace(c) {
is_comment = false;
}
}
}
}
}
}
}
None
/// Returns the first non-trivia token right before `offset` or `None` if at the start of the file
/// or all preceding tokens are trivia tokens.
///
/// ## Notes
///
/// Prefer [`first_non_trivia_token`] whenever possible; reverse lookup is expensive because of comments.
pub(crate) fn first_non_trivia_token_rev(offset: TextSize, code: &str) -> Option<Token> {
SimpleTokenizer::up_to(offset, code)
.skip_trivia()
.next_back()
}
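A sketch of the reverse lookup (hypothetical input): scanning backwards from offset 3 in `(  a`, the whitespace is skipped and the `(` token is returned.

let code = "(  a";
let token = first_non_trivia_token_rev(TextSize::new(3), code);
assert_eq!(token.map(|t| t.kind()), Some(TokenKind::LParen));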
/// Returns the number of newlines between `offset` and the first preceding non-whitespace character in the source code.
pub(crate) fn lines_before(offset: TextSize, code: &str) -> u32 {
let head = &code[TextRange::up_to(offset)];
let tokens = SimpleTokenizer::up_to(offset, code);
let mut newlines = 0u32;
for (index, c) in head.char_indices().rev() {
match c {
'\n' => {
if head.as_bytes()[index.saturating_sub(1)] == b'\r' {
continue;
}
for token in tokens.rev() {
match token.kind() {
TokenKind::Newline => {
newlines += 1;
}
'\r' => {
newlines += 1;
TokenKind::Whitespace => {
// ignore
}
_ => {
break;
}
c if is_python_whitespace(c) => continue,
_ => break,
}
}
@@ -124,22 +52,20 @@ pub(crate) fn lines_before(offset: TextSize, code: &str) -> u32 {
/// Counts the empty lines between `offset` and the first non-whitespace character.
pub(crate) fn lines_after(offset: TextSize, code: &str) -> u32 {
let rest = &code[usize::from(offset)..];
let mut newlines = 0;
let tokens = SimpleTokenizer::starts_at(offset, code);
let mut newlines = 0u32;
for (index, c) in rest.char_indices() {
match c {
'\n' => {
for token in tokens {
match token.kind() {
TokenKind::Newline => {
newlines += 1;
}
'\r' if rest.as_bytes().get(index + 1).copied() == Some(b'\n') => {
continue;
TokenKind::Whitespace => {
// ignore
}
'\r' => {
newlines += 1;
_ => {
break;
}
c if is_python_whitespace(c) => continue,
_ => break,
}
}
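A usage sketch for both counters (hypothetical input): with two empty lines between the statements, three newline tokens lie between the end of `x = 1` and the start of `y = 2`.

let code = "x = 1\n\n\ny = 2";
// Offset 5 is right after the `1`; offset 8 is right before the `y`.
assert_eq!(lines_after(TextSize::new(5), code), 3);
assert_eq!(lines_before(TextSize::new(8), code), 3);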
@@ -148,35 +74,579 @@ pub(crate) fn lines_after(offset: TextSize, code: &str) -> u32 {
/// Returns the position after skipping any trailing trivia up to, but not including the newline character.
pub(crate) fn skip_trailing_trivia(offset: TextSize, code: &str) -> TextSize {
let rest = &code[usize::from(offset)..];
let mut iter = rest.char_indices();
let tokenizer = SimpleTokenizer::starts_at(offset, code);
while let Some((relative_offset, c)) = iter.next() {
match c {
'\n' | '\r' => return offset + TextSize::try_from(relative_offset).unwrap(),
'#' => {
// Skip the comment
let newline_offset = iter
.as_str()
.find(['\n', '\r'])
.unwrap_or(iter.as_str().len());
return offset
+ TextSize::try_from(relative_offset + '#'.len_utf8() + newline_offset)
.unwrap();
for token in tokenizer {
match token.kind() {
TokenKind::Whitespace | TokenKind::Comment | TokenKind::Continuation => {
// No op
}
_ => {
return token.start();
}
c if is_python_whitespace(c) => continue,
_ => return offset + TextSize::try_from(relative_offset).unwrap(),
}
}
offset + rest.text_len()
offset
}
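A sketch (hypothetical input): starting right after the `1`, the trailing whitespace and the comment are skipped, and the newline's offset is returned.

let code = "x = 1  # trailing\ny = 2";
// The `\n` sits at offset 17, right after the comment.
assert_eq!(skip_trailing_trivia(TextSize::new(5), code), TextSize::new(17));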
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub(crate) struct Token {
pub(crate) kind: TokenKind,
pub(crate) range: TextRange,
}
impl Token {
pub(crate) const fn kind(&self) -> TokenKind {
self.kind
}
#[allow(unused)]
pub(crate) const fn range(&self) -> TextRange {
self.range
}
pub(crate) const fn start(&self) -> TextSize {
self.range.start()
}
#[allow(unused)]
pub(crate) const fn end(&self) -> TextSize {
self.range.end()
}
}
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
pub(crate) enum TokenKind {
/// A comment, not including the trailing new line.
Comment,
/// Sequence of ' ' or '\t'
Whitespace,
/// Start or end of the file
EndOfFile,
/// `\\`
Continuation,
/// `\n` or `\r` or `\r\n`
Newline,
/// `(`
LParen,
/// `)`
RParen,
/// `{`
LBrace,
/// `}`
RBrace,
/// `[`
LBracket,
/// `]`
RBracket,
/// `,`
Comma,
/// `:`
Colon,
/// '/'
Slash,
/// Any other non-trivia token. Always has a length of 1
Other,
/// Returned for each character after [`TokenKind::Other`] has been returned once.
Bogus,
}
impl TokenKind {
const fn from_non_trivia_char(c: char) -> TokenKind {
match c {
'(' => TokenKind::LParen,
')' => TokenKind::RParen,
'[' => TokenKind::LBracket,
']' => TokenKind::RBracket,
'{' => TokenKind::LBrace,
'}' => TokenKind::RBrace,
',' => TokenKind::Comma,
':' => TokenKind::Colon,
'/' => TokenKind::Slash,
_ => TokenKind::Other,
}
}
const fn is_trivia(self) -> bool {
matches!(
self,
TokenKind::Whitespace
| TokenKind::Newline
| TokenKind::Comment
| TokenKind::Continuation
)
}
}
/// Simple zero-allocation tokenizer for tokenizing trivia (and some tokens).
///
/// The tokenizer must start at an offset that is trivia (e.g. not inside of a multiline string).
///
/// The tokenizer doesn't guarantee any correctness after it has returned a [`TokenKind::Other`]. That's why it
/// will return [`TokenKind::Bogus`] for every following character until it reaches the end of the file.
pub(crate) struct SimpleTokenizer<'a> {
offset: TextSize,
back_offset: TextSize,
/// `true` when it is known that the current `back` line has no comment.
back_line_has_no_comment: bool,
bogus: bool,
cursor: Cursor<'a>,
}
impl<'a> SimpleTokenizer<'a> {
pub(crate) fn new(source: &'a str, range: TextRange) -> Self {
Self {
offset: range.start(),
back_offset: range.end(),
back_line_has_no_comment: false,
bogus: false,
cursor: Cursor::new(&source[range]),
}
}
pub(crate) fn starts_at(offset: TextSize, source: &'a str) -> Self {
let range = TextRange::new(offset, source.text_len());
Self::new(source, range)
}
pub(crate) fn up_to(offset: TextSize, source: &'a str) -> Self {
Self::new(source, TextRange::up_to(offset))
}
fn next_token(&mut self) -> Token {
self.cursor.start_token();
let Some(first) = self.cursor.bump() else {
return Token {
kind: TokenKind::EndOfFile,
range: TextRange::empty(self.offset),
}
};
if self.bogus {
let token = Token {
kind: TokenKind::Bogus,
range: TextRange::at(self.offset, first.text_len()),
};
self.offset += first.text_len();
return token;
}
let kind = match first {
' ' | '\t' => {
self.cursor.eat_while(|c| matches!(c, ' ' | '\t'));
TokenKind::Whitespace
}
'\n' => TokenKind::Newline,
'\r' => {
self.cursor.eat_char('\n');
TokenKind::Newline
}
'#' => {
self.cursor.eat_while(|c| !matches!(c, '\n' | '\r'));
TokenKind::Comment
}
'\\' => TokenKind::Continuation,
c => {
let kind = TokenKind::from_non_trivia_char(c);
if kind == TokenKind::Other {
self.bogus = true;
}
kind
}
};
let token_len = self.cursor.token_len();
let token = Token {
kind,
range: TextRange::at(self.offset, token_len),
};
self.offset += token_len;
token
}
/// Returns the next token from the back. Prefer iterating forwards. Iterating backwards is significantly more expensive
/// because it needs to check if the line has any comments when encountering any non-trivia token.
pub(crate) fn next_token_back(&mut self) -> Token {
self.cursor.start_token();
let Some(last) = self.cursor.bump_back() else {
return Token {
kind: TokenKind::EndOfFile,
range: TextRange::empty(self.back_offset),
}
};
if self.bogus {
let token = Token {
kind: TokenKind::Bogus,
range: TextRange::at(self.back_offset - last.text_len(), last.text_len()),
};
self.back_offset -= last.text_len();
return token;
}
let kind = match last {
// This may not be 100% correct because it lexes out a comment's trailing whitespace
// as whitespace rather than as part of the comment token. This shouldn't matter for what we use the lexer for.
' ' | '\t' => {
self.cursor.eat_back_while(|c| matches!(c, ' ' | '\t'));
TokenKind::Whitespace
}
'\r' => {
self.back_line_has_no_comment = false;
TokenKind::Newline
}
'\n' => {
self.back_line_has_no_comment = false;
self.cursor.eat_char_back('\r');
TokenKind::Newline
}
// Empty comment (could also be a comment nested in another comment, but this shouldn't matter for what we use the lexer for)
'#' => TokenKind::Comment,
// For all other tokens, test if the character isn't part of a comment.
c => {
let mut comment_offset = None;
// Skip the test whether there's a preceding comment if it has been performed before.
if !self.back_line_has_no_comment {
let rest = self.cursor.chars.as_str();
for (back_index, c) in rest.chars().rev().enumerate() {
match c {
'#' => {
// Potentially a comment
comment_offset = Some(back_index + 1);
}
'\r' | '\n' | '\\' => {
break;
}
c => {
if !is_python_whitespace(c)
&& TokenKind::from_non_trivia_char(c) == TokenKind::Other
{
comment_offset = None;
}
}
}
}
}
// From here on it is guaranteed that this line has no other comment.
self.back_line_has_no_comment = true;
if let Some(comment_offset) = comment_offset {
// It is a comment: bump back over its remaining characters, including the `#`
for _ in 0..comment_offset {
self.cursor.bump_back().unwrap();
}
TokenKind::Comment
} else if c == '\\' {
TokenKind::Continuation
} else {
let kind = TokenKind::from_non_trivia_char(c);
if kind == TokenKind::Other {
self.bogus = true;
}
kind
}
}
};
let token_len = self.cursor.token_len();
let start = self.back_offset - token_len;
let token = Token {
kind,
range: TextRange::at(start, token_len),
};
self.back_offset = start;
token
}
pub(crate) fn skip_trivia(self) -> impl Iterator<Item = Token> + DoubleEndedIterator + 'a {
self.filter(|t| !t.kind().is_trivia())
}
}
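A sketch of the `Other` to `Bogus` cascade described in the doc comment above (hypothetical input): once a character without a dedicated kind is hit, every following character is reported as `Bogus`.

let source = "a,";
let kinds: Vec<_> = SimpleTokenizer::new(source, TextRange::up_to(source.text_len()))
    .map(|t| t.kind())
    .collect();
// `a` has no dedicated kind; the `,` after it is no longer trusted.
assert_eq!(kinds, vec![TokenKind::Other, TokenKind::Bogus]);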
impl Iterator for SimpleTokenizer<'_> {
type Item = Token;
fn next(&mut self) -> Option<Self::Item> {
let token = self.next_token();
if token.kind == TokenKind::EndOfFile {
None
} else {
Some(token)
}
}
}
impl DoubleEndedIterator for SimpleTokenizer<'_> {
fn next_back(&mut self) -> Option<Self::Item> {
let token = self.next_token_back();
if token.kind == TokenKind::EndOfFile {
None
} else {
Some(token)
}
}
}
const EOF_CHAR: char = '\0';
#[derive(Debug, Clone)]
struct Cursor<'a> {
chars: Chars<'a>,
source_length: TextSize,
}
impl<'a> Cursor<'a> {
fn new(source: &'a str) -> Self {
Self {
source_length: source.text_len(),
chars: source.chars(),
}
}
/// Peeks the next character from the input stream without consuming it.
/// Returns [`EOF_CHAR`] if the cursor is at the end of the file.
fn first(&self) -> char {
self.chars.clone().next().unwrap_or(EOF_CHAR)
}
/// Peeks the last character from the input stream without consuming it.
/// Returns [`EOF_CHAR`] if all characters have been consumed.
fn last(&self) -> char {
self.chars.clone().next_back().unwrap_or(EOF_CHAR)
}
// SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
#[allow(clippy::cast_possible_truncation)]
fn text_len(&self) -> TextSize {
TextSize::new(self.chars.as_str().len() as u32)
}
fn token_len(&self) -> TextSize {
self.source_length - self.text_len()
}
fn start_token(&mut self) {
self.source_length = self.text_len();
}
fn is_eof(&self) -> bool {
self.chars.as_str().is_empty()
}
/// Consumes the next character
fn bump(&mut self) -> Option<char> {
self.chars.next()
}
/// Consumes the next character from the back
fn bump_back(&mut self) -> Option<char> {
self.chars.next_back()
}
fn eat_char(&mut self, c: char) -> bool {
if self.first() == c {
self.bump();
true
} else {
false
}
}
fn eat_char_back(&mut self, c: char) -> bool {
if self.last() == c {
self.bump_back();
true
} else {
false
}
}
/// Eats symbols while predicate returns true or until the end of file is reached.
fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
// An optimized version of this was tried for e.g. line comments, but
// LLVM can inline all of this and compile it down to fast iteration over bytes.
while predicate(self.first()) && !self.is_eof() {
self.bump();
}
}
/// Eats symbols from the back while predicate returns true or until the beginning of file is reached.
fn eat_back_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
// An optimized version of this was tried for e.g. line comments, but
// LLVM can inline all of this and compile it down to fast iteration over bytes.
while predicate(self.last()) && !self.is_eof() {
self.bump_back();
}
}
}
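A sketch of the cursor protocol the tokenizer uses (hypothetical input): mark a token start, consume characters, then measure how much was consumed.

let mut cursor = Cursor::new("  x");
cursor.start_token();
cursor.eat_while(|c| c == ' ');
assert_eq!(cursor.token_len(), TextSize::new(2)); // the two spaces
assert_eq!(cursor.first(), 'x'); // next unconsumed character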
#[cfg(test)]
mod tests {
use crate::trivia::{lines_after, lines_before};
use ruff_text_size::TextSize;
use crate::trivia::{lines_after, lines_before, SimpleTokenizer, Token};
use insta::assert_debug_snapshot;
use ruff_text_size::{TextLen, TextRange, TextSize};
struct TokenizationTestCase {
source: &'static str,
range: TextRange,
tokens: Vec<Token>,
}
impl TokenizationTestCase {
fn assert_reverse_tokenization(&self) {
let mut backwards = self.tokenize_reverse();
// Re-reverse to get the tokens in forward order.
backwards.reverse();
assert_eq!(&backwards, &self.tokens);
}
fn tokenize_reverse(&self) -> Vec<Token> {
SimpleTokenizer::new(self.source, self.range)
.rev()
.collect()
}
fn tokens(&self) -> &[Token] {
&self.tokens
}
}
fn tokenize_range(source: &'static str, range: TextRange) -> TokenizationTestCase {
let tokens: Vec<_> = SimpleTokenizer::new(source, range).collect();
TokenizationTestCase {
source,
range,
tokens,
}
}
fn tokenize(source: &'static str) -> TokenizationTestCase {
tokenize_range(source, TextRange::new(TextSize::new(0), source.text_len()))
}
#[test]
fn tokenize_trivia() {
let source = "# comment\n # comment";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_parentheses() {
let source = "([{}])";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_comma() {
let source = ",,,,";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_continuation() {
let source = "( \\\n )";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_substring() {
let source = "('some string') # comment";
let test_case =
tokenize_range(source, TextRange::new(TextSize::new(14), source.text_len()));
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_slash() {
let source = r#" # trailing positional comment
        # Positional arguments only after here
        ,/"#;
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_bogus() {
let source = r#"# leading comment
"a string"
a = (10)"#;
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
assert_debug_snapshot!("Reverse", test_case.tokenize_reverse());
}
#[test]
fn lines_before_empty_string() {