Add Tokens newtype wrapper, TokenKind iterator (#11361)

## Summary

Alternative to #11237 

This PR adds a new `Tokens` struct, a newtype wrapper around a vector of
lexer output (`Vec<LexResult>`). This allows us to add a `kinds` method
that returns an iterator over the corresponding `TokenKind`s. The iterator
is implemented as a separate `TokenKindIter` struct so that the type can be
named, and so that additional methods like `peek` can be provided directly
on the iterator.

This gives the linter access to the stream of `TokenKind`s instead of
`Tok`s.

Edit: I've made the necessary downstream changes and plan to merge the
entire stack at once.
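
As a rough usage sketch (not code from this PR), the following assumes only the
public items visible in the diff below: `tokenize`, `Mode`, and the new
`Tokens::kinds`/`TokenKindIter::peek`:

```rust
use ruff_python_parser::{tokenize, Mode};

fn main() {
    // `tokenize` now returns `Tokens` rather than a bare `Vec<LexResult>`.
    let tokens = tokenize("def foo(): ...", Mode::Module);

    // Iterate over lightweight `TokenKind`s instead of owned `Tok`s.
    let mut kinds = tokens.kinds();
    while let Some((kind, range)) = kinds.next() {
        // `peek` returns the next value without advancing the iterator.
        let next = kinds.peek().map(|(kind, _)| kind);
        println!("{kind:?} at {range:?}, next: {next:?}");
    }
}
```

Because `TokenKindIter` is a named type, callers can store it in their own
structs and call `peek` on it directly, without wrapping it in
`std::iter::Peekable`.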
Authored by Dhruv Manilawala, committed via GitHub on 2024-05-14 22:15:04 +05:30
parent 50f14d017e · commit 025768d303
9 changed files with 142 additions and 25 deletions


@@ -110,6 +110,9 @@
//! [parsing]: https://en.wikipedia.org/wiki/Parsing
//! [lexer]: crate::lexer
+use std::iter::FusedIterator;
+use std::ops::Deref;
use crate::lexer::{lex, lex_starts_at, LexResult};
pub use crate::error::{FStringErrorType, ParseError, ParseErrorType};
@@ -117,7 +120,7 @@ pub use crate::parser::Program;
pub use crate::token::{Tok, TokenKind};
use ruff_python_ast::{Expr, Mod, ModModule, PySourceType, Suite};
-use ruff_text_size::TextSize;
+use ruff_text_size::{Ranged, TextRange, TextSize};
mod error;
pub mod lexer;
@@ -339,8 +342,113 @@ pub fn parse_tokens(tokens: Vec<LexResult>, source: &str, mode: Mode) -> Result<
}
}
/// Tokens represents a vector of [`LexResult`].
///
/// This should only include tokens up to and including the first error. This struct is created
/// by the [`tokenize`] function.
#[derive(Debug, Clone)]
pub struct Tokens(Vec<LexResult>);

impl Tokens {
    /// Returns an iterator over the [`TokenKind`] and range of each token.
    pub fn kinds(&self) -> TokenKindIter {
        TokenKindIter::new(&self.0)
    }

    /// Returns an iterator over the [`TokenKind`] and its range for all the tokens that are
    /// within the given `range`.
    ///
    /// The start and end position of the given range should correspond to the start position of
    /// the first token and the end position of the last token in the returned iterator.
    ///
    /// For example, if the struct contains the following tokens:
    /// ```txt
    /// (Def, 0..3)
    /// (Name, 4..7)
    /// (Lpar, 7..8)
    /// (Rpar, 8..9)
    /// (Colon, 9..10)
    /// (Ellipsis, 11..14)
    /// (Newline, 14..14)
    /// ```
    ///
    /// Then, the range `4..10` returns an iterator which yields the `Name`, `Lpar`, `Rpar`, and
    /// `Colon` tokens. But, if the given positions don't match any token boundaries, an empty
    /// iterator is returned.
    pub fn kinds_within_range<T: Ranged>(&self, ranged: T) -> TokenKindIter {
        let Ok(start_index) = self.binary_search_by_key(&ranged.start(), |result| match result {
            Ok((_, range)) => range.start(),
            Err(error) => error.location().start(),
        }) else {
            return TokenKindIter::default();
        };
        let Ok(end_index) = self.binary_search_by_key(&ranged.end(), |result| match result {
            Ok((_, range)) => range.end(),
            Err(error) => error.location().end(),
        }) else {
            return TokenKindIter::default();
        };
        TokenKindIter::new(self.get(start_index..=end_index).unwrap_or(&[]))
    }

    /// Consumes the [`Tokens`], returning the underlying vector of [`LexResult`].
    pub fn into_inner(self) -> Vec<LexResult> {
        self.0
    }
}
impl Deref for Tokens {
    type Target = [LexResult];

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
/// An iterator over the [`TokenKind`] and the corresponding range.
///
/// This struct is created by the [`Tokens::kinds`] method.
#[derive(Clone, Default)]
pub struct TokenKindIter<'a> {
    inner: std::iter::Flatten<std::slice::Iter<'a, LexResult>>,
}

impl<'a> TokenKindIter<'a> {
    /// Create a new iterator from a slice of [`LexResult`].
    pub fn new(tokens: &'a [LexResult]) -> Self {
        Self {
            inner: tokens.iter().flatten(),
        }
    }

    /// Return the next value without advancing the iterator.
    pub fn peek(&mut self) -> Option<(TokenKind, TextRange)> {
        self.clone().next()
    }
}

impl Iterator for TokenKindIter<'_> {
    type Item = (TokenKind, TextRange);

    fn next(&mut self) -> Option<Self::Item> {
        let &(ref tok, range) = self.inner.next()?;
        Some((TokenKind::from_token(tok), range))
    }
}

impl FusedIterator for TokenKindIter<'_> {}

impl DoubleEndedIterator for TokenKindIter<'_> {
    fn next_back(&mut self) -> Option<Self::Item> {
        let &(ref tok, range) = self.inner.next_back()?;
        Some((TokenKind::from_token(tok), range))
    }
}
/// Collect tokens up to and including the first error.
-pub fn tokenize(contents: &str, mode: Mode) -> Vec<LexResult> {
+pub fn tokenize(contents: &str, mode: Mode) -> Tokens {
    let mut tokens: Vec<LexResult> = allocate_tokens_vec(contents);
    for tok in lexer::lex(contents, mode) {
        let is_err = tok.is_err();
@@ -350,7 +458,7 @@ pub fn tokenize(contents: &str, mode: Mode) -> Vec<LexResult> {
        }
    }
-    tokens
+    Tokens(tokens)
}
/// Tokenizes all tokens.
@@ -380,7 +488,7 @@ fn approximate_tokens_lower_bound(contents: &str) -> usize {
/// Parse a full Python program from its tokens.
pub fn parse_program_tokens(
-    tokens: Vec<LexResult>,
+    tokens: Tokens,
    source: &str,
    is_jupyter_notebook: bool,
) -> anyhow::Result<Suite, ParseError> {
@@ -389,7 +497,7 @@ pub fn parse_program_tokens(
    } else {
        Mode::Module
    };
-    match parse_tokens(tokens, source, mode)? {
+    match parse_tokens(tokens.into_inner(), source, mode)? {
        Mod::Module(m) => Ok(m.body),
        Mod::Expression(_) => unreachable!("Mode::Module doesn't return other variant"),
    }
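
To illustrate `kinds_within_range`, here is a hedged sketch that reuses the
token layout from the doc comment above for `def foo(): ...`; it assumes
`tokenize` and `Tokens` are public as shown, and that `TextRange` implements
the `Ranged` trait imported in the first hunk:

```rust
use ruff_python_parser::{tokenize, Mode};
use ruff_text_size::{TextRange, TextSize};

fn main() {
    // Token layout for this source, per the doc comment above:
    // Def 0..3, Name 4..7, Lpar 7..8, Rpar 8..9, Colon 9..10, ...
    let tokens = tokenize("def foo(): ...", Mode::Module);

    // `4..10` starts exactly at `Name` and ends exactly at `Colon`, so the
    // iterator yields Name, Lpar, Rpar, and Colon. A range that doesn't line
    // up with token boundaries would yield an empty iterator instead.
    let range = TextRange::new(TextSize::from(4), TextSize::from(10));
    for (kind, token_range) in tokens.kinds_within_range(range) {
        println!("{kind:?} at {token_range:?}");
    }
}
```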


@@ -228,6 +228,11 @@ pub enum Tok {
}

impl Tok {
    #[inline]
    pub fn kind(&self) -> TokenKind {
        TokenKind::from_token(self)
    }

    pub fn start_marker(mode: Mode) -> Self {
        match mode {
            Mode::Module | Mode::Ipython => Tok::StartModule,
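
The second file adds a `Tok::kind` convenience method. A small hedged sketch
of using it over raw lexer output, assuming the `lexer::lex` entry point
imported in the first hunk and that `LexResult`'s success value is a
`(Tok, TextRange)` pair, as the iterator implementations above suggest:

```rust
use ruff_python_parser::lexer::lex;
use ruff_python_parser::Mode;

fn main() {
    // Each `LexResult` is a `Result` holding a `(Tok, TextRange)` pair; on
    // success, `Tok::kind` maps the owned token to its `TokenKind`.
    for result in lex("x = 1", Mode::Module) {
        if let Ok((tok, range)) = result {
            println!("{:?} at {range:?}", tok.kind());
        }
    }
}
```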