Rename ruff_python_whitespace to ruff_python_trivia (#5886)

## Summary

This crate now contains utilities for dealing with trivia more broadly:
whitespace, newlines, "simple" trivia lexing, etc. It has therefore been
renamed to reflect its increased responsibilities.

To avoid conflicts, I've also renamed `Token` and `TokenKind` to
`SimpleToken` and `SimpleTokenKind`.
This commit is contained in:
Charlie Marsh 2023-07-19 11:48:27 -04:00 committed by GitHub
parent a75a6de577
commit 5f3da9955a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
86 changed files with 360 additions and 353 deletions

View file

@ -0,0 +1,22 @@
[package]
name = "ruff_python_trivia"
version = "0.0.0"
publish = false
authors = { workspace = true }
edition = { workspace = true }
rust-version = { workspace = true }
homepage = { workspace = true }
documentation = { workspace = true }
repository = { workspace = true }
license = { workspace = true }
[lib]
[dependencies]
ruff_text_size = { workspace = true }
memchr = { workspace = true }
unic-ucd-ident = "0.9.0"
[dev-dependencies]
insta = { workspace = true }

View file

@ -0,0 +1,103 @@
use std::str::Chars;
use ruff_text_size::{TextLen, TextSize};
/// Sentinel character returned by [`Cursor::first`] and [`Cursor::last`] when
/// no characters remain on the respective end of the input.
pub const EOF_CHAR: char = '\0';
/// A [`Cursor`] over a string.
#[derive(Debug, Clone)]
pub struct Cursor<'a> {
    // The not-yet-consumed characters of the input.
    chars: Chars<'a>,
    // Remaining input length as of the last `start_token` call (initially the
    // full source length). Used by `token_len` to compute consumed bytes.
    source_length: TextSize,
}
impl<'a> Cursor<'a> {
    /// Creates a new cursor positioned at the start of `source`.
    pub fn new(source: &'a str) -> Self {
        Self {
            source_length: source.text_len(),
            chars: source.chars(),
        }
    }
    /// Returns an iterator over the remaining (unconsumed) characters.
    pub fn chars(&self) -> Chars<'a> {
        self.chars.clone()
    }
    /// Peeks the next character from the input stream without consuming it.
    /// Returns [`EOF_CHAR`] if the cursor is at the end of the input.
    pub fn first(&self) -> char {
        self.chars.clone().next().unwrap_or(EOF_CHAR)
    }
    /// Peeks the last character of the remaining input without consuming it.
    /// Returns [`EOF_CHAR`] if the remaining input is empty.
    pub fn last(&self) -> char {
        self.chars.clone().next_back().unwrap_or(EOF_CHAR)
    }
    /// Returns the length of the remaining (unconsumed) input.
    // SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
    #[allow(clippy::cast_possible_truncation)]
    pub fn text_len(&self) -> TextSize {
        TextSize::new(self.chars.as_str().len() as u32)
    }
    /// Returns the number of bytes consumed (from either end) since the last
    /// call to [`Cursor::start_token`] (or since creation).
    pub fn token_len(&self) -> TextSize {
        self.source_length - self.text_len()
    }
    /// Marks the current position as the start of a new token.
    pub fn start_token(&mut self) {
        self.source_length = self.text_len();
    }
    /// Returns `true` if all input has been consumed.
    pub fn is_eof(&self) -> bool {
        self.chars.as_str().is_empty()
    }
    /// Consumes the next character
    pub fn bump(&mut self) -> Option<char> {
        self.chars.next()
    }
    /// Consumes the next character from the back
    pub fn bump_back(&mut self) -> Option<char> {
        self.chars.next_back()
    }
    /// Consumes the next character if it equals `c`; returns `true` if it did.
    pub fn eat_char(&mut self, c: char) -> bool {
        if self.first() == c {
            self.bump();
            true
        } else {
            false
        }
    }
    /// Consumes the last character if it equals `c`; returns `true` if it did.
    pub fn eat_char_back(&mut self, c: char) -> bool {
        if self.last() == c {
            self.bump_back();
            true
        } else {
            false
        }
    }
    /// Eats symbols while predicate returns true or until the end of file is reached.
    pub fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
        // It was tried making optimized version of this for eg. line comments, but
        // LLVM can inline all of this and compile it down to fast iteration over bytes.
        while predicate(self.first()) && !self.is_eof() {
            self.bump();
        }
    }
    /// Eats symbols from the back while predicate returns true or until the beginning of file is reached.
    pub fn eat_back_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
        // It was tried making optimized version of this for eg. line comments, but
        // LLVM can inline all of this and compile it down to fast iteration over bytes.
        while predicate(self.last()) && !self.is_eof() {
            self.bump_back();
        }
    }
}

View file

@ -0,0 +1,9 @@
// Submodules of the crate (named `ruff_python_trivia` per the package manifest).
mod cursor;
mod newlines;
mod tokenizer;
mod whitespace;
// Flatten the public API: everything is re-exported at the crate root.
pub use cursor::*;
pub use newlines::*;
pub use tokenizer::*;
pub use whitespace::*;

View file

@ -0,0 +1,453 @@
use std::iter::FusedIterator;
use std::ops::Deref;
use memchr::{memchr2, memrchr2};
use ruff_text_size::{TextLen, TextRange, TextSize};
/// Extension trait for [`str`] that provides a [`UniversalNewlineIterator`].
pub trait UniversalNewlines {
    /// Returns an iterator over the lines of this string that recognizes
    /// LF, CRLF, and CR line endings.
    fn universal_newlines(&self) -> UniversalNewlineIterator<'_>;
}
impl UniversalNewlines for str {
    fn universal_newlines(&self) -> UniversalNewlineIterator<'_> {
        UniversalNewlineIterator::from(self)
    }
}
/// Like [`str#lines`], but accommodates LF, CRLF, and CR line endings,
/// the latter of which are not supported by [`str#lines`].
///
/// ## Examples
///
/// ```rust
/// # use ruff_text_size::TextSize;
/// # use ruff_python_trivia::{Line, UniversalNewlineIterator};
/// let mut lines = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop");
///
/// assert_eq!(lines.next_back(), Some(Line::new("bop", TextSize::from(14))));
/// assert_eq!(lines.next(), Some(Line::new("foo\n", TextSize::from(0))));
/// assert_eq!(lines.next_back(), Some(Line::new("baz\r", TextSize::from(10))));
/// assert_eq!(lines.next(), Some(Line::new("bar\n", TextSize::from(4))));
/// assert_eq!(lines.next_back(), Some(Line::new("\r\n", TextSize::from(8))));
/// assert_eq!(lines.next(), None);
/// ```
pub struct UniversalNewlineIterator<'a> {
    // The portion of the text not yet yielded from either end.
    text: &'a str,
    // Absolute offset of the start of `text` (advances in `next`).
    offset: TextSize,
    // Absolute offset of the end of `text` (retreats in `next_back`).
    offset_back: TextSize,
}
impl<'a> UniversalNewlineIterator<'a> {
    /// Creates an iterator over `text` whose reported line offsets are relative
    /// to `offset` (the absolute position of `text` in the original source).
    pub fn with_offset(text: &'a str, offset: TextSize) -> UniversalNewlineIterator<'a> {
        UniversalNewlineIterator {
            text,
            offset,
            offset_back: offset + text.text_len(),
        }
    }
    /// Creates an iterator over `text` with offsets starting at zero.
    pub fn from(text: &'a str) -> UniversalNewlineIterator<'a> {
        Self::with_offset(text, TextSize::default())
    }
}
/// Finds the next newline character. Returns its position and the [`LineEnding`].
#[inline]
pub fn find_newline(text: &str) -> Option<(usize, LineEnding)> {
    let bytes = text.as_bytes();
    // No `\n` or `\r` anywhere -> no newline.
    let position = memchr2(b'\n', b'\r', bytes)?;
    // SAFETY: memchr guarantees to return valid positions
    #[allow(unsafe_code)]
    let newline_character = unsafe { *bytes.get_unchecked(position) };
    // Explicit branch for `\n` first as this is the most likely path.
    let line_ending = if newline_character == b'\n' {
        LineEnding::Lf
    } else if bytes.get(position.saturating_add(1)) == Some(&b'\n') {
        // A `\r` immediately followed by `\n`.
        LineEnding::CrLf
    } else {
        // A lone `\r`.
        LineEnding::Cr
    };
    Some((position, line_ending))
}
impl<'a> Iterator for UniversalNewlineIterator<'a> {
    type Item = Line<'a>;
    /// Yields the next line from the front, including its newline character(s).
    #[inline]
    fn next(&mut self) -> Option<Line<'a>> {
        if self.text.is_empty() {
            return None;
        }
        let line = match find_newline(self.text) {
            // A terminated line: yield it including the newline character(s).
            Some((newline_position, line_ending)) => {
                let split = newline_position + line_ending.len();
                let text = &self.text[..split];
                self.text = &self.text[split..];
                let line = Line {
                    offset: self.offset,
                    text,
                };
                self.offset += text.text_len();
                line
            }
            // The final, unterminated line: consume everything that's left.
            None => Line {
                offset: self.offset,
                text: std::mem::take(&mut self.text),
            },
        };
        Some(line)
    }
    /// The last line from the front is the first line from the back.
    fn last(mut self) -> Option<Self::Item> {
        self.next_back()
    }
}
impl DoubleEndedIterator for UniversalNewlineIterator<'_> {
    #[inline]
    fn next_back(&mut self) -> Option<Self::Item> {
        if self.text.is_empty() {
            return None;
        }
        let len = self.text.len();
        // Trim any trailing newlines.
        // The terminator of the line we're about to yield must not itself be
        // found as a line boundary, so the search excludes it.
        let haystack = match self.text.as_bytes()[len - 1] {
            b'\n' if len > 1 && self.text.as_bytes()[len - 2] == b'\r' => &self.text[..len - 2],
            b'\n' | b'\r' => &self.text[..len - 1],
            _ => self.text,
        };
        // Find the end of the previous line. The previous line is the text up to, but not including
        // the newline character.
        let line = if let Some(line_end) = memrchr2(b'\n', b'\r', haystack.as_bytes()) {
            // '\n' or '\r' or '\r\n'
            let (remainder, line) = self.text.split_at(line_end + 1);
            self.text = remainder;
            self.offset_back -= line.text_len();
            Line {
                text: line,
                offset: self.offset_back,
            }
        } else {
            // Last line
            let offset = self.offset_back - self.text.text_len();
            Line {
                text: std::mem::take(&mut self.text),
                offset,
            }
        };
        Some(line)
    }
}
// Sound: once `text` is empty, `next`/`next_back` keep returning `None`.
impl FusedIterator for UniversalNewlineIterator<'_> {}
/// Like [`UniversalNewlineIterator`], but includes a trailing newline as an empty line.
pub struct NewlineWithTrailingNewline<'a> {
    // The synthetic empty line yielded after the underlying iterator is exhausted.
    trailing: Option<Line<'a>>,
    underlying: UniversalNewlineIterator<'a>,
}
impl<'a> NewlineWithTrailingNewline<'a> {
    /// Creates an iterator over `input` with offsets starting at zero.
    pub fn from(input: &'a str) -> NewlineWithTrailingNewline<'a> {
        Self::with_offset(input, TextSize::default())
    }
    /// Creates an iterator over `input`, reporting offsets relative to `offset`.
    pub fn with_offset(input: &'a str, offset: TextSize) -> Self {
        // When the input ends in a newline, remember a zero-length line
        // positioned right after it so it can be yielded at the very end.
        let trailing = if input.ends_with(['\r', '\n']) {
            Some(Line {
                text: "",
                offset: offset + input.text_len(),
            })
        } else {
            None
        };
        NewlineWithTrailingNewline {
            underlying: UniversalNewlineIterator::with_offset(input, offset),
            trailing,
        }
    }
}
impl<'a> Iterator for NewlineWithTrailingNewline<'a> {
    type Item = Line<'a>;
    #[inline]
    fn next(&mut self) -> Option<Line<'a>> {
        // Drain the underlying iterator first, then the synthetic empty line.
        match self.underlying.next() {
            Some(line) => Some(line),
            None => self.trailing.take(),
        }
    }
}
/// A line of source text together with the absolute offset at which it starts.
/// The stored text includes the terminating newline character(s), if any.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Line<'a> {
    // The line's text, including its terminating newline (if any).
    text: &'a str,
    // Absolute offset of the line's first character.
    offset: TextSize,
}
impl<'a> Line<'a> {
    /// Creates a line from its text (newline included) and its start offset.
    pub fn new(text: &'a str, offset: TextSize) -> Self {
        Self { text, offset }
    }
    /// Returns the byte offset at which the line starts.
    #[inline]
    pub const fn start(&self) -> TextSize {
        self.offset
    }
    /// Returns the byte offset where the line ends, including its terminating new line character.
    #[inline]
    pub fn full_end(&self) -> TextSize {
        self.offset + self.full_text_len()
    }
    /// Returns the byte offset where the line ends, excluding its new line character
    #[inline]
    pub fn end(&self) -> TextSize {
        self.offset + self.as_str().text_len()
    }
    /// Returns the range of the line, including its terminating new line character.
    #[inline]
    pub fn full_range(&self) -> TextRange {
        TextRange::at(self.offset, self.text.text_len())
    }
    /// Returns the range of the line, excluding its terminating new line character
    #[inline]
    pub fn range(&self) -> TextRange {
        TextRange::new(self.start(), self.end())
    }
    /// Returns the line's new line character, if any.
    #[inline]
    pub fn line_ending(&self) -> Option<LineEnding> {
        // Inspect the last byte(s): `\r\n` is CRLF, lone `\n` is LF, lone `\r` is CR.
        let mut bytes = self.text.bytes().rev();
        match bytes.next() {
            Some(b'\n') => {
                if bytes.next() == Some(b'\r') {
                    Some(LineEnding::CrLf)
                } else {
                    Some(LineEnding::Lf)
                }
            }
            Some(b'\r') => Some(LineEnding::Cr),
            _ => None,
        }
    }
    /// Returns the text of the line, excluding the terminating new line character.
    #[inline]
    pub fn as_str(&self) -> &'a str {
        let newline_len = self
            .line_ending()
            .map_or(0, |line_ending| line_ending.len());
        &self.text[..self.text.len() - newline_len]
    }
    /// Returns the line's text, including the terminating new line character.
    #[inline]
    pub fn as_full_str(&self) -> &'a str {
        self.text
    }
    /// Returns the length of the line, including its terminating new line character.
    #[inline]
    pub fn full_text_len(&self) -> TextSize {
        self.text.text_len()
    }
}
// Deref to the line's text *without* its trailing newline.
impl Deref for Line<'_> {
    type Target = str;
    fn deref(&self) -> &Self::Target {
        self.as_str()
    }
}
// Compare lines against string slices by their newline-stripped text,
// so callers can write `line == "foo"`.
impl PartialEq<&str> for Line<'_> {
    fn eq(&self, other: &&str) -> bool {
        self.as_str() == *other
    }
}
impl PartialEq<Line<'_>> for &str {
    fn eq(&self, other: &Line<'_>) -> bool {
        *self == other.as_str()
    }
}
/// The line ending style used in Python source code.
/// See <https://docs.python.org/3/reference/lexical_analysis.html#physical-lines>
#[derive(Debug, PartialEq, Eq, Copy, Clone)]
pub enum LineEnding {
    /// Line feed only: `\n`
    Lf,
    /// Carriage return only: `\r`
    Cr,
    /// Carriage return followed by line feed: `\r\n`
    CrLf,
}
impl Default for LineEnding {
    // Default to the platform-native line ending.
    fn default() -> Self {
        if cfg!(windows) {
            LineEnding::CrLf
        } else {
            LineEnding::Lf
        }
    }
}
impl LineEnding {
    /// Returns the line ending as a string slice.
    pub const fn as_str(&self) -> &'static str {
        match self {
            LineEnding::Lf => "\n",
            LineEnding::Cr => "\r",
            LineEnding::CrLf => "\r\n",
        }
    }
    /// Returns the number of bytes in the line ending (1 or 2).
    #[allow(clippy::len_without_is_empty)]
    pub const fn len(&self) -> usize {
        self.as_str().len()
    }
    /// Like [`LineEnding::len`], but returned as a [`TextSize`].
    #[allow(clippy::cast_possible_truncation)]
    pub const fn text_len(&self) -> TextSize {
        // `len` is at most 2, so the cast can never truncate.
        TextSize::new(self.len() as u32)
    }
}
// Allow a `LineEnding` to be used wherever a `&str` is expected.
impl Deref for LineEnding {
    type Target = str;
    fn deref(&self) -> &Self::Target {
        self.as_str()
    }
}
#[cfg(test)]
mod tests {
    use ruff_text_size::TextSize;
    use super::{Line, UniversalNewlineIterator};
    // Empty input yields no lines in either direction.
    #[test]
    fn universal_newlines_empty_str() {
        let lines: Vec<_> = UniversalNewlineIterator::from("").collect();
        assert_eq!(lines, Vec::<Line>::new());
        let lines: Vec<_> = UniversalNewlineIterator::from("").rev().collect();
        assert_eq!(lines, Vec::<Line>::new());
    }
    // Forward iteration over mixed LF/CRLF/CR endings; trailing newlines
    // produce a final line ending in the newline, not an extra empty line.
    #[test]
    fn universal_newlines_forward() {
        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop").collect();
        assert_eq!(
            lines,
            vec![
                Line::new("foo\n", TextSize::from(0)),
                Line::new("bar\n", TextSize::from(4)),
                Line::new("\r\n", TextSize::from(8)),
                Line::new("baz\r", TextSize::from(10)),
                Line::new("bop", TextSize::from(14)),
            ]
        );
        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop\n").collect();
        assert_eq!(
            lines,
            vec![
                Line::new("foo\n", TextSize::from(0)),
                Line::new("bar\n", TextSize::from(4)),
                Line::new("\r\n", TextSize::from(8)),
                Line::new("baz\r", TextSize::from(10)),
                Line::new("bop\n", TextSize::from(14)),
            ]
        );
        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop\n\n").collect();
        assert_eq!(
            lines,
            vec![
                Line::new("foo\n", TextSize::from(0)),
                Line::new("bar\n", TextSize::from(4)),
                Line::new("\r\n", TextSize::from(8)),
                Line::new("baz\r", TextSize::from(10)),
                Line::new("bop\n", TextSize::from(14)),
                Line::new("\n", TextSize::from(18)),
            ]
        );
    }
    // Backward iteration yields the same lines in reverse order.
    // (The second case compares via `as_str`, using `PartialEq<Line>` for `&str`.)
    #[test]
    fn universal_newlines_backwards() {
        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop")
            .rev()
            .collect();
        assert_eq!(
            lines,
            vec![
                Line::new("bop", TextSize::from(14)),
                Line::new("baz\r", TextSize::from(10)),
                Line::new("\r\n", TextSize::from(8)),
                Line::new("bar\n", TextSize::from(4)),
                Line::new("foo\n", TextSize::from(0)),
            ]
        );
        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\nbaz\rbop\n")
            .rev()
            .map(|line| line.as_str())
            .collect();
        assert_eq!(
            lines,
            vec![
                Line::new("bop\n", TextSize::from(13)),
                Line::new("baz\r", TextSize::from(9)),
                Line::new("\n", TextSize::from(8)),
                Line::new("bar\n", TextSize::from(4)),
                Line::new("foo\n", TextSize::from(0)),
            ]
        );
    }
    // Interleaved front/back iteration must meet in the middle without
    // yielding any line twice.
    #[test]
    fn universal_newlines_mixed() {
        let mut lines = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop");
        assert_eq!(
            lines.next_back(),
            Some(Line::new("bop", TextSize::from(14)))
        );
        assert_eq!(lines.next(), Some(Line::new("foo\n", TextSize::from(0))));
        assert_eq!(
            lines.next_back(),
            Some(Line::new("baz\r", TextSize::from(10)))
        );
        assert_eq!(lines.next(), Some(Line::new("bar\n", TextSize::from(4))));
        assert_eq!(
            lines.next_back(),
            Some(Line::new("\r\n", TextSize::from(8)))
        );
        assert_eq!(lines.next(), None);
    }
}

View file

@ -0,0 +1,218 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokenize_reverse()
---
[
SimpleToken {
kind: RParen,
range: 52..53,
},
SimpleToken {
kind: Other,
range: 51..52,
},
SimpleToken {
kind: Bogus,
range: 50..51,
},
SimpleToken {
kind: Bogus,
range: 49..50,
},
SimpleToken {
kind: Bogus,
range: 48..49,
},
SimpleToken {
kind: Bogus,
range: 47..48,
},
SimpleToken {
kind: Bogus,
range: 46..47,
},
SimpleToken {
kind: Bogus,
range: 45..46,
},
SimpleToken {
kind: Bogus,
range: 44..45,
},
SimpleToken {
kind: Bogus,
range: 43..44,
},
SimpleToken {
kind: Bogus,
range: 42..43,
},
SimpleToken {
kind: Bogus,
range: 41..42,
},
SimpleToken {
kind: Bogus,
range: 40..41,
},
SimpleToken {
kind: Bogus,
range: 39..40,
},
SimpleToken {
kind: Bogus,
range: 38..39,
},
SimpleToken {
kind: Bogus,
range: 37..38,
},
SimpleToken {
kind: Bogus,
range: 36..37,
},
SimpleToken {
kind: Bogus,
range: 35..36,
},
SimpleToken {
kind: Bogus,
range: 34..35,
},
SimpleToken {
kind: Bogus,
range: 33..34,
},
SimpleToken {
kind: Bogus,
range: 32..33,
},
SimpleToken {
kind: Bogus,
range: 31..32,
},
SimpleToken {
kind: Bogus,
range: 30..31,
},
SimpleToken {
kind: Bogus,
range: 29..30,
},
SimpleToken {
kind: Bogus,
range: 28..29,
},
SimpleToken {
kind: Bogus,
range: 27..28,
},
SimpleToken {
kind: Bogus,
range: 26..27,
},
SimpleToken {
kind: Bogus,
range: 25..26,
},
SimpleToken {
kind: Bogus,
range: 24..25,
},
SimpleToken {
kind: Bogus,
range: 23..24,
},
SimpleToken {
kind: Bogus,
range: 22..23,
},
SimpleToken {
kind: Bogus,
range: 21..22,
},
SimpleToken {
kind: Bogus,
range: 20..21,
},
SimpleToken {
kind: Bogus,
range: 19..20,
},
SimpleToken {
kind: Bogus,
range: 18..19,
},
SimpleToken {
kind: Bogus,
range: 17..18,
},
SimpleToken {
kind: Bogus,
range: 16..17,
},
SimpleToken {
kind: Bogus,
range: 15..16,
},
SimpleToken {
kind: Bogus,
range: 14..15,
},
SimpleToken {
kind: Bogus,
range: 13..14,
},
SimpleToken {
kind: Bogus,
range: 12..13,
},
SimpleToken {
kind: Bogus,
range: 11..12,
},
SimpleToken {
kind: Bogus,
range: 10..11,
},
SimpleToken {
kind: Bogus,
range: 9..10,
},
SimpleToken {
kind: Bogus,
range: 8..9,
},
SimpleToken {
kind: Bogus,
range: 7..8,
},
SimpleToken {
kind: Bogus,
range: 6..7,
},
SimpleToken {
kind: Bogus,
range: 5..6,
},
SimpleToken {
kind: Bogus,
range: 4..5,
},
SimpleToken {
kind: Bogus,
range: 3..4,
},
SimpleToken {
kind: Bogus,
range: 2..3,
},
SimpleToken {
kind: Bogus,
range: 1..2,
},
SimpleToken {
kind: Bogus,
range: 0..1,
},
]

View file

@ -0,0 +1,10 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: Other,
range: 0..2,
},
]

View file

@ -0,0 +1,18 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: Other,
range: 0..1,
},
SimpleToken {
kind: Bogus,
range: 1..2,
},
SimpleToken {
kind: Bogus,
range: 2..3,
},
]

View file

@ -0,0 +1,126 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: Comment,
range: 0..17,
},
SimpleToken {
kind: Newline,
range: 17..18,
},
SimpleToken {
kind: Whitespace,
range: 18..26,
},
SimpleToken {
kind: Other,
range: 26..27,
},
SimpleToken {
kind: Bogus,
range: 27..28,
},
SimpleToken {
kind: Bogus,
range: 28..29,
},
SimpleToken {
kind: Bogus,
range: 29..30,
},
SimpleToken {
kind: Bogus,
range: 30..31,
},
SimpleToken {
kind: Bogus,
range: 31..32,
},
SimpleToken {
kind: Bogus,
range: 32..33,
},
SimpleToken {
kind: Bogus,
range: 33..34,
},
SimpleToken {
kind: Bogus,
range: 34..35,
},
SimpleToken {
kind: Bogus,
range: 35..36,
},
SimpleToken {
kind: Bogus,
range: 36..37,
},
SimpleToken {
kind: Bogus,
range: 37..38,
},
SimpleToken {
kind: Bogus,
range: 38..39,
},
SimpleToken {
kind: Bogus,
range: 39..40,
},
SimpleToken {
kind: Bogus,
range: 40..41,
},
SimpleToken {
kind: Bogus,
range: 41..42,
},
SimpleToken {
kind: Bogus,
range: 42..43,
},
SimpleToken {
kind: Bogus,
range: 43..44,
},
SimpleToken {
kind: Bogus,
range: 44..45,
},
SimpleToken {
kind: Bogus,
range: 45..46,
},
SimpleToken {
kind: Bogus,
range: 46..47,
},
SimpleToken {
kind: Bogus,
range: 47..48,
},
SimpleToken {
kind: Bogus,
range: 48..49,
},
SimpleToken {
kind: Bogus,
range: 49..50,
},
SimpleToken {
kind: Bogus,
range: 50..51,
},
SimpleToken {
kind: Bogus,
range: 51..52,
},
SimpleToken {
kind: Bogus,
range: 52..53,
},
]

View file

@ -0,0 +1,22 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: Comma,
range: 0..1,
},
SimpleToken {
kind: Comma,
range: 1..2,
},
SimpleToken {
kind: Comma,
range: 2..3,
},
SimpleToken {
kind: Comma,
range: 3..4,
},
]

View file

@ -0,0 +1,30 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: LParen,
range: 0..1,
},
SimpleToken {
kind: Whitespace,
range: 1..2,
},
SimpleToken {
kind: Continuation,
range: 2..3,
},
SimpleToken {
kind: Newline,
range: 3..4,
},
SimpleToken {
kind: Whitespace,
range: 4..5,
},
SimpleToken {
kind: RParen,
range: 5..6,
},
]

View file

@ -0,0 +1,34 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: If,
range: 0..2,
},
SimpleToken {
kind: Whitespace,
range: 2..3,
},
SimpleToken {
kind: In,
range: 3..5,
},
SimpleToken {
kind: Whitespace,
range: 5..6,
},
SimpleToken {
kind: Else,
range: 6..10,
},
SimpleToken {
kind: Whitespace,
range: 10..11,
},
SimpleToken {
kind: Match,
range: 11..16,
},
]

View file

@ -0,0 +1,30 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: LParen,
range: 0..1,
},
SimpleToken {
kind: LBracket,
range: 1..2,
},
SimpleToken {
kind: LBrace,
range: 2..3,
},
SimpleToken {
kind: RBrace,
range: 3..4,
},
SimpleToken {
kind: RBracket,
range: 4..5,
},
SimpleToken {
kind: RParen,
range: 5..6,
},
]

View file

@ -0,0 +1,42 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: Whitespace,
range: 0..1,
},
SimpleToken {
kind: Comment,
range: 1..30,
},
SimpleToken {
kind: Newline,
range: 30..31,
},
SimpleToken {
kind: Whitespace,
range: 31..39,
},
SimpleToken {
kind: Comment,
range: 39..77,
},
SimpleToken {
kind: Newline,
range: 77..78,
},
SimpleToken {
kind: Whitespace,
range: 78..86,
},
SimpleToken {
kind: Comma,
range: 86..87,
},
SimpleToken {
kind: Slash,
range: 87..88,
},
]

View file

@ -0,0 +1,18 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: RParen,
range: 14..15,
},
SimpleToken {
kind: Whitespace,
range: 15..16,
},
SimpleToken {
kind: Comment,
range: 16..25,
},
]

View file

@ -0,0 +1,22 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: Comment,
range: 0..9,
},
SimpleToken {
kind: Newline,
range: 9..10,
},
SimpleToken {
kind: Whitespace,
range: 10..14,
},
SimpleToken {
kind: Comment,
range: 14..23,
},
]

View file

@ -0,0 +1,10 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: Other,
range: 0..6,
},
]

View file

@ -0,0 +1,784 @@
use memchr::memrchr3_iter;
use ruff_text_size::{TextLen, TextRange, TextSize};
use unic_ucd_ident::{is_xid_continue, is_xid_start};
use crate::{is_python_whitespace, Cursor};
/// Searches for the first non-trivia token starting at `offset`.
///
/// The search skips over any whitespace and comments.
///
/// Returns `Some` with the first non-trivia [`SimpleToken`].
///
/// Returns `None` if the rest of `code` is empty or only contains trivia (whitespace or comments).
pub fn first_non_trivia_token(offset: TextSize, code: &str) -> Option<SimpleToken> {
    SimpleTokenizer::starts_at(offset, code)
        .skip_trivia()
        .next()
}
/// Returns the first non-trivia token right before `offset` or `None` if at the start of the file
/// or all preceding tokens are trivia tokens.
///
/// ## Notes
///
/// Prefer [`first_non_trivia_token`] whenever possible because reverse lookup is expensive because of comments.
pub fn first_non_trivia_token_rev(offset: TextSize, code: &str) -> Option<SimpleToken> {
    SimpleTokenizer::up_to(offset, code)
        .skip_trivia()
        .next_back()
}
/// Returns the number of newlines between `offset` and the first non whitespace character in the source code.
pub fn lines_before(offset: TextSize, code: &str) -> u32 {
    let mut newlines = 0u32;
    // Walk backwards from `offset`: whitespace is skipped, newlines are
    // counted, and any other token ends the scan.
    for kind in SimpleTokenizer::up_to(offset, code)
        .rev()
        .map(|token| token.kind())
    {
        if kind == SimpleTokenKind::Newline {
            newlines += 1;
        } else if kind != SimpleTokenKind::Whitespace {
            break;
        }
    }
    newlines
}
/// Counts the empty lines between `offset` and the first non-whitespace character.
pub fn lines_after(offset: TextSize, code: &str) -> u32 {
    let mut newlines = 0u32;
    // Walk forwards from `offset`: whitespace is skipped, newlines are
    // counted, and any other token ends the scan.
    for kind in SimpleTokenizer::starts_at(offset, code).map(|token| token.kind()) {
        if kind == SimpleTokenKind::Newline {
            newlines += 1;
        } else if kind != SimpleTokenKind::Whitespace {
            break;
        }
    }
    newlines
}
/// Returns the position after skipping any trailing trivia up to, but not including the newline character.
pub fn skip_trailing_trivia(offset: TextSize, code: &str) -> TextSize {
    // Find the first token that isn't whitespace, a comment, or a line
    // continuation; a newline (or any other token) stops the skipping.
    SimpleTokenizer::starts_at(offset, code)
        .find(|token| {
            !matches!(
                token.kind(),
                SimpleTokenKind::Whitespace
                    | SimpleTokenKind::Comment
                    | SimpleTokenKind::Continuation
            )
        })
        // If only trivia remains, the position is unchanged.
        .map_or(offset, |token| token.start())
}
/// Returns `true` if `c` may start a Python identifier: an ASCII letter, `_`,
/// or a non-ASCII XID_Start character.
fn is_identifier_start(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_' || is_non_ascii_identifier_start(c)
}
// Checks if the character c is a valid continuation character as described
// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
fn is_identifier_continuation(c: char) -> bool {
    if c.is_ascii() {
        // Fast path: ASCII letters, digits, and underscore.
        c.is_ascii_alphanumeric() || c == '_'
    } else {
        is_xid_continue(c)
    }
}
/// Returns `true` if the non-ASCII character `c` may start an identifier
/// (Unicode XID_Start).
fn is_non_ascii_identifier_start(c: char) -> bool {
    is_xid_start(c)
}
/// A token produced by the [`SimpleTokenizer`]: a kind plus its source range.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub struct SimpleToken {
    pub kind: SimpleTokenKind,
    pub range: TextRange,
}
impl SimpleToken {
    /// Returns the token's kind.
    pub const fn kind(&self) -> SimpleTokenKind {
        self.kind
    }
    /// Returns the token's range in the source.
    #[allow(unused)]
    pub const fn range(&self) -> TextRange {
        self.range
    }
    /// Returns the offset at which the token starts.
    pub const fn start(&self) -> TextSize {
        self.range.start()
    }
    /// Returns the offset at which the token ends (exclusive).
    pub const fn end(&self) -> TextSize {
        self.range.end()
    }
}
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
pub enum SimpleTokenKind {
    /// A comment, not including the trailing new line.
    Comment,
    /// Sequence of ' ' or '\t'
    Whitespace,
    /// Start or end of the file
    EndOfFile,
    /// `\\`
    Continuation,
    /// `\n` or `\r` or `\r\n`
    Newline,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// '/'
    Slash,
    /// '*'
    Star,
    /// `.`.
    Dot,
    /// `else`
    Else,
    /// `if`
    If,
    /// `in`
    In,
    /// `as`
    As,
    /// `match`
    Match,
    /// `with`
    With,
    /// `async`
    Async,
    /// Any other non trivia token.
    Other,
    /// Returned for each character after [`SimpleTokenKind::Other`] has been returned once.
    Bogus,
}
impl SimpleTokenKind {
    /// Maps a single punctuation character to its token kind. Multi-character
    /// tokens (identifiers, keywords, trivia) are handled by the tokenizer itself;
    /// anything unrecognized becomes [`SimpleTokenKind::Other`].
    const fn from_non_trivia_char(c: char) -> SimpleTokenKind {
        match c {
            '(' => SimpleTokenKind::LParen,
            ')' => SimpleTokenKind::RParen,
            '[' => SimpleTokenKind::LBracket,
            ']' => SimpleTokenKind::RBracket,
            '{' => SimpleTokenKind::LBrace,
            '}' => SimpleTokenKind::RBrace,
            ',' => SimpleTokenKind::Comma,
            ':' => SimpleTokenKind::Colon,
            '/' => SimpleTokenKind::Slash,
            '*' => SimpleTokenKind::Star,
            '.' => SimpleTokenKind::Dot,
            _ => SimpleTokenKind::Other,
        }
    }
    /// Returns `true` for trivia: whitespace, newlines, comments, and line continuations.
    const fn is_trivia(self) -> bool {
        matches!(
            self,
            SimpleTokenKind::Whitespace
                | SimpleTokenKind::Newline
                | SimpleTokenKind::Comment
                | SimpleTokenKind::Continuation
        )
    }
}
/// Simple zero allocation tokenizer for tokenizing trivia (and some tokens).
///
/// The tokenizer must start at an offset that is trivia (e.g. not inside of a multiline string).
///
/// The tokenizer doesn't guarantee any correctness after it returned a [`SimpleTokenKind::Other`]. That's why it
/// will return [`SimpleTokenKind::Bogus`] for every character after until it reaches the end of the file.
pub struct SimpleTokenizer<'a> {
    // Offset of the next token lexed from the front.
    offset: TextSize,
    // End offset of the not-yet-lexed input; moves left as `next_token_back` lexes.
    back_offset: TextSize,
    /// `true` when it is known that the current `back` line has no comment for sure.
    back_line_has_no_comment: bool,
    // Set once a `SimpleTokenKind::Other` has been produced; every following
    // token (in either direction) is `Bogus`.
    bogus: bool,
    source: &'a str,
    cursor: Cursor<'a>,
}
impl<'a> SimpleTokenizer<'a> {
pub fn new(source: &'a str, range: TextRange) -> Self {
Self {
offset: range.start(),
back_offset: range.end(),
back_line_has_no_comment: false,
bogus: false,
source,
cursor: Cursor::new(&source[range]),
}
}
pub fn starts_at(offset: TextSize, source: &'a str) -> Self {
let range = TextRange::new(offset, source.text_len());
Self::new(source, range)
}
/// Creates a tokenizer that lexes tokens from the start of `source` up to `offset`.
pub fn up_to(offset: TextSize, source: &'a str) -> Self {
Self::new(source, TextRange::up_to(offset))
}
/// Creates a tokenizer that lexes tokens from the start of `source` up to `offset`, and informs
/// the lexer that the line at `offset` contains no comments. This can significantly speed up backwards lexing
/// because the lexer doesn't need to scan for comments.
pub fn up_to_without_back_comment(offset: TextSize, source: &'a str) -> Self {
let mut tokenizer = Self::up_to(offset, source);
tokenizer.back_line_has_no_comment = true;
tokenizer
}
fn to_keyword_or_other(&self, range: TextRange) -> SimpleTokenKind {
let source = &self.source[range];
match source {
"as" => SimpleTokenKind::As,
"async" => SimpleTokenKind::Async,
"else" => SimpleTokenKind::Else,
"if" => SimpleTokenKind::If,
"in" => SimpleTokenKind::In,
"match" => SimpleTokenKind::Match, // Match is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
"with" => SimpleTokenKind::With,
// ...,
_ => SimpleTokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
}
}
fn next_token(&mut self) -> SimpleToken {
self.cursor.start_token();
let Some(first) = self.cursor.bump() else {
return SimpleToken {
kind: SimpleTokenKind::EndOfFile,
range: TextRange::empty(self.offset),
};
};
if self.bogus {
let token = SimpleToken {
kind: SimpleTokenKind::Bogus,
range: TextRange::at(self.offset, first.text_len()),
};
self.offset += first.text_len();
return token;
}
let kind = match first {
' ' | '\t' => {
self.cursor.eat_while(|c| matches!(c, ' ' | '\t'));
SimpleTokenKind::Whitespace
}
'\n' => SimpleTokenKind::Newline,
'\r' => {
self.cursor.eat_char('\n');
SimpleTokenKind::Newline
}
'#' => {
self.cursor.eat_while(|c| !matches!(c, '\n' | '\r'));
SimpleTokenKind::Comment
}
'\\' => SimpleTokenKind::Continuation,
c => {
let kind = if is_identifier_start(c) {
self.cursor.eat_while(is_identifier_continuation);
let token_len = self.cursor.token_len();
let range = TextRange::at(self.offset, token_len);
self.to_keyword_or_other(range)
} else {
SimpleTokenKind::from_non_trivia_char(c)
};
if kind == SimpleTokenKind::Other {
self.bogus = true;
}
kind
}
};
let token_len = self.cursor.token_len();
let token = SimpleToken {
kind,
range: TextRange::at(self.offset, token_len),
};
self.offset += token_len;
token
}
/// Returns the next token from the back. Prefer iterating forwards. Iterating backwards is significantly more expensive
/// because it needs to check if the line has any comments when encountering any non-trivia token.
pub fn next_token_back(&mut self) -> SimpleToken {
    self.cursor.start_token();

    // No character left before `back_offset`: emit a zero-width end-of-file token.
    let Some(last) = self.cursor.bump_back() else {
        return SimpleToken {
            kind: SimpleTokenKind::EndOfFile,
            range: TextRange::empty(self.back_offset),
        };
    };

    // Once an unclassifiable character has been seen, every further token is
    // reported as `Bogus`, one character at a time.
    if self.bogus {
        let token = SimpleToken {
            kind: SimpleTokenKind::Bogus,
            range: TextRange::at(self.back_offset - last.text_len(), last.text_len()),
        };

        self.back_offset -= last.text_len();
        return token;
    }

    let kind = match last {
        // This may not be 100% correct because it will lex-out trailing whitespace from a comment
        // as whitespace rather than being part of the token. This shouldn't matter for what we use the lexer for.
        ' ' | '\t' => {
            self.cursor.eat_back_while(|c| matches!(c, ' ' | '\t'));
            SimpleTokenKind::Whitespace
        }

        '\r' => {
            // Crossing a line boundary invalidates the cached "this line has no comment" flag.
            self.back_line_has_no_comment = false;
            SimpleTokenKind::Newline
        }

        '\n' => {
            self.back_line_has_no_comment = false;
            // Fold a preceding '\r' into this token so that "\r\n" lexes as a single newline.
            self.cursor.eat_char_back('\r');
            SimpleTokenKind::Newline
        }

        // Empty comment (could also be a comment nested in another comment, but this shouldn't matter for what we use the lexer for)
        '#' => SimpleTokenKind::Comment,

        // For all other tokens, test if the character isn't part of a comment.
        c => {
            // Skip the test whether there's a preceding comment if it has been performed before.
            let comment_offset = if self.back_line_has_no_comment {
                None
            } else {
                let bytes = self.cursor.chars().as_str().as_bytes();
                let mut line_start = 0;
                let mut last_comment_offset = None;

                // Find the start of the line, or any potential comments.
                // Scans backwards through the remaining input for the nearest
                // '\n'/'\r' (line start) while remembering the last '#' seen.
                for index in memrchr3_iter(b'\n', b'\r', b'#', bytes) {
                    if bytes[index] == b'#' {
                        // Potentially a comment, but not guaranteed
                        last_comment_offset = Some(index);
                    } else {
                        line_start = index + 1;
                        break;
                    }
                }

                // Verify if this is indeed a comment. Doing this only when we've found a comment is significantly
                // faster because comments are rare.
                // The candidate '#' only counts when everything between the line
                // start and the '#' is Python whitespace or a simple (non-`Other`)
                // single-character token.
                last_comment_offset.filter(|last_comment_offset| {
                    let before_comment =
                        &self.cursor.chars().as_str()[line_start..*last_comment_offset];

                    before_comment.chars().all(|c| {
                        is_python_whitespace(c)
                            || SimpleTokenKind::from_non_trivia_char(c)
                                != SimpleTokenKind::Other
                    })
                })
            };

            // From here on it is guaranteed that this line has no other comment.
            self.back_line_has_no_comment = true;

            if let Some(comment_offset) = comment_offset {
                let comment_length = self.cursor.chars().as_str().len() - comment_offset;
                // It is a comment, bump all tokens
                for _ in 0..comment_length {
                    self.cursor.bump_back().unwrap();
                }

                SimpleTokenKind::Comment
            } else if c == '\\' {
                SimpleTokenKind::Continuation
            } else {
                let kind = if is_identifier_continuation(c) {
                    // if we only have identifier continuations but no start (e.g. 555) we
                    // don't want to consume the chars, so in that case, we want to rewind the
                    // cursor to here
                    let savepoint = self.cursor.clone();
                    self.cursor.eat_back_while(is_identifier_continuation);

                    let token_len = self.cursor.token_len();
                    let range = TextRange::at(self.back_offset - token_len, token_len);

                    // Only a valid identifier/keyword if its first character is a
                    // legal identifier start; otherwise rewind and fall through.
                    if self.source[range]
                        .chars()
                        .next()
                        .is_some_and(is_identifier_start)
                    {
                        self.to_keyword_or_other(range)
                    } else {
                        self.cursor = savepoint;
                        SimpleTokenKind::Other
                    }
                } else {
                    SimpleTokenKind::from_non_trivia_char(c)
                };

                // Unrecognized character: poison the tokenizer for this direction.
                if kind == SimpleTokenKind::Other {
                    self.bogus = true;
                }

                kind
            }
        }
    };

    // Compute the range from how far the cursor advanced and move `back_offset`
    // to the start of the token just produced.
    let token_len = self.cursor.token_len();

    let start = self.back_offset - token_len;

    let token = SimpleToken {
        kind,
        range: TextRange::at(start, token_len),
    };

    self.back_offset = start;

    token
}
/// Returns an iterator over the remaining tokens, dropping every token for
/// which [`SimpleTokenKind::is_trivia`] returns `true`.
pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + DoubleEndedIterator + 'a {
    self.filter(|token| !token.kind().is_trivia())
}
}
impl Iterator for SimpleTokenizer<'_> {
    type Item = SimpleToken;

    /// Produces the next forward token; the synthetic end-of-file token
    /// terminates the iteration instead of being yielded.
    fn next(&mut self) -> Option<Self::Item> {
        let token = self.next_token();

        match token.kind {
            SimpleTokenKind::EndOfFile => None,
            _ => Some(token),
        }
    }
}
impl DoubleEndedIterator for SimpleTokenizer<'_> {
    /// Produces the next token from the back; the synthetic end-of-file token
    /// terminates the iteration instead of being yielded.
    fn next_back(&mut self) -> Option<Self::Item> {
        let token = self.next_token_back();

        match token.kind {
            SimpleTokenKind::EndOfFile => None,
            _ => Some(token),
        }
    }
}
#[cfg(test)]
mod tests {
    use insta::assert_debug_snapshot;

    use ruff_text_size::{TextLen, TextRange, TextSize};

    use crate::tokenizer::{lines_after, lines_before, SimpleToken, SimpleTokenizer};

    /// A tokenization scenario: the source, the sub-range to tokenize, and the
    /// tokens produced by a forward pass over that range.
    struct TokenizationTestCase {
        source: &'static str,
        range: TextRange,
        tokens: Vec<SimpleToken>,
    }

    impl TokenizationTestCase {
        // Asserts that a backwards pass yields exactly the forward tokens.
        fn assert_reverse_tokenization(&self) {
            let mut backwards = self.tokenize_reverse();

            // Re-reverse to get the tokens in forward order.
            backwards.reverse();

            assert_eq!(&backwards, &self.tokens);
        }

        // Collects the tokens produced by iterating from the back of the range.
        fn tokenize_reverse(&self) -> Vec<SimpleToken> {
            SimpleTokenizer::new(self.source, self.range)
                .rev()
                .collect()
        }

        fn tokens(&self) -> &[SimpleToken] {
            &self.tokens
        }
    }

    // Builds a test case by tokenizing only `range` of `source` (forward pass).
    fn tokenize_range(source: &'static str, range: TextRange) -> TokenizationTestCase {
        let tokens: Vec<_> = SimpleTokenizer::new(source, range).collect();

        TokenizationTestCase {
            source,
            range,
            tokens,
        }
    }

    // Builds a test case covering all of `source`.
    fn tokenize(source: &'static str) -> TokenizationTestCase {
        tokenize_range(source, TextRange::new(TextSize::new(0), source.text_len()))
    }

    #[test]
    fn tokenize_trivia() {
        let source = "# comment\n # comment";

        let test_case = tokenize(source);

        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn tokenize_parentheses() {
        let source = "([{}])";

        let test_case = tokenize(source);

        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn tokenize_comma() {
        let source = ",,,,";

        let test_case = tokenize(source);

        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn tokenize_continuation() {
        let source = "( \\\n )";

        let test_case = tokenize(source);

        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn tricky_unicode() {
        let source = "មុ";

        let test_case = tokenize(source);

        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn identifier_ending_in_non_start_char() {
        let source = "i5";

        let test_case = tokenize(source);

        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn ignore_word_with_only_id_continuing_chars() {
        let source = "555";

        let test_case = tokenize(source);

        assert_debug_snapshot!(test_case.tokens());

        // note: not reversible: [other, bogus, bogus] vs [bogus, bogus, other]
    }

    #[test]
    fn tokenize_multichar() {
        let source = "if in else match";

        let test_case = tokenize(source);

        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn tokenize_substring() {
        // Tokenizes only the trailing portion after the string literal.
        let source = "('some string') # comment";

        let test_case =
            tokenize_range(source, TextRange::new(TextSize::new(14), source.text_len()));

        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn tokenize_slash() {
        let source = r#" # trailing positional comment
# Positional arguments only after here
,/"#;

        let test_case = tokenize(source);

        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn tokenize_bogus() {
        // Contains constructs the simple tokenizer cannot handle (string
        // literals, identifiers followed by `=`), so reverse tokenization
        // degenerates to `Bogus` tokens and is snapshotted separately.
        let source = r#"# leading comment
"a string"
a = (10)"#;

        let test_case = tokenize(source);

        assert_debug_snapshot!(test_case.tokens());
        assert_debug_snapshot!("Reverse", test_case.tokenize_reverse());
    }

    #[test]
    fn lines_before_empty_string() {
        assert_eq!(lines_before(TextSize::new(0), ""), 0);
    }

    #[test]
    fn lines_before_in_the_middle_of_a_line() {
        assert_eq!(lines_before(TextSize::new(4), "a = 20"), 0);
    }

    #[test]
    fn lines_before_on_a_new_line() {
        assert_eq!(lines_before(TextSize::new(7), "a = 20\nb = 10"), 1);
    }

    #[test]
    fn lines_before_multiple_leading_newlines() {
        // "\n" followed by "\r\n" counts as two line breaks.
        assert_eq!(lines_before(TextSize::new(9), "a = 20\n\r\nb = 10"), 2);
    }

    #[test]
    fn lines_before_with_comment_offset() {
        assert_eq!(lines_before(TextSize::new(8), "a = 20\n# a comment"), 0);
    }

    #[test]
    fn lines_before_with_trailing_comment() {
        assert_eq!(
            lines_before(TextSize::new(22), "a = 20 # some comment\nb = 10"),
            1
        );
    }

    #[test]
    fn lines_before_with_comment_only_line() {
        assert_eq!(
            lines_before(TextSize::new(22), "a = 20\n# some comment\nb = 10"),
            1
        );
    }

    #[test]
    fn lines_after_empty_string() {
        assert_eq!(lines_after(TextSize::new(0), ""), 0);
    }

    #[test]
    fn lines_after_in_the_middle_of_a_line() {
        assert_eq!(lines_after(TextSize::new(4), "a = 20"), 0);
    }

    #[test]
    fn lines_after_before_a_new_line() {
        assert_eq!(lines_after(TextSize::new(6), "a = 20\nb = 10"), 1);
    }

    #[test]
    fn lines_after_multiple_newlines() {
        assert_eq!(lines_after(TextSize::new(6), "a = 20\n\r\nb = 10"), 2);
    }

    #[test]
    fn lines_after_before_comment_offset() {
        assert_eq!(lines_after(TextSize::new(7), "a = 20 # a comment\n"), 0);
    }

    #[test]
    fn lines_after_with_comment_only_line() {
        assert_eq!(
            lines_after(TextSize::new(6), "a = 20\n# some comment\nb = 10"),
            1
        );
    }
}

View file

@ -0,0 +1,43 @@
/// Returns `true` for [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens)
/// characters.
pub const fn is_python_whitespace(c: char) -> bool {
    // Python's intra-line whitespace: space, tab, or form-feed.
    c == ' ' || c == '\t' || c == '\x0C'
}
/// Extract the leading indentation from a line.
///
/// Returns the prefix of `line` consisting of Python whitespace; if the whole
/// line is whitespace, the entire line is returned.
pub fn leading_indentation(line: &str) -> &str {
    match line.find(|char: char| !is_python_whitespace(char)) {
        Some(index) => &line[..index],
        None => line,
    }
}
/// Extension trait providing trim operations that only strip characters
/// Python treats as whitespace (see [`is_python_whitespace`]).
pub trait PythonWhitespace {
    /// Like `str::trim()`, but only removes whitespace characters that Python considers
    /// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens).
    fn trim_whitespace(&self) -> &Self;

    /// Like `str::trim_start()`, but only removes whitespace characters that Python considers
    /// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens).
    fn trim_whitespace_start(&self) -> &Self;

    /// Like `str::trim_end()`, but only removes whitespace characters that Python considers
    /// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens).
    fn trim_whitespace_end(&self) -> &Self;
}
impl PythonWhitespace for str {
fn trim_whitespace(&self) -> &Self {
self.trim_matches(is_python_whitespace)
}
fn trim_whitespace_start(&self) -> &Self {
self.trim_start_matches(is_python_whitespace)
}
fn trim_whitespace_end(&self) -> &Self {
self.trim_end_matches(is_python_whitespace)
}
}