mirror of
https://github.com/astral-sh/ruff.git
synced 2025-09-26 11:59:35 +00:00
Add Tokens newtype wrapper, TokenKind iterator (#11361)
## Summary

Alternative to #11237.

This PR adds a new `Tokens` struct which is a newtype wrapper around a vector of lexer output. This allows us to add a `kinds` method which returns an iterator over the corresponding `TokenKind`. This iterator is implemented as a separate `TokenKindIter` struct so that the type can be named and can provide additional methods like `peek` directly on the iterator. This allows the linter to access the stream of `TokenKind` instead of `Tok`.

Edit: I've made the necessary downstream changes and plan to merge the entire stack at once.
This commit is contained in:
parent
50f14d017e
commit
025768d303
9 changed files with 142 additions and 25 deletions
|
@ -321,7 +321,6 @@ mod tests {
|
|||
|
||||
use ruff_python_ast::PySourceType;
|
||||
use ruff_python_codegen::Stylist;
|
||||
use ruff_python_parser::lexer::LexResult;
|
||||
use ruff_python_parser::{parse_suite, Mode};
|
||||
use ruff_source_file::{LineEnding, Locator};
|
||||
use ruff_text_size::TextSize;
|
||||
|
@ -332,7 +331,7 @@ mod tests {
|
|||
fn start_of_file() -> Result<()> {
|
||||
fn insert(contents: &str) -> Result<Insertion> {
|
||||
let program = parse_suite(contents)?;
|
||||
let tokens: Vec<LexResult> = ruff_python_parser::tokenize(contents, Mode::Module);
|
||||
let tokens = ruff_python_parser::tokenize(contents, Mode::Module);
|
||||
let locator = Locator::new(contents);
|
||||
let stylist = Stylist::from_tokens(&tokens, &locator);
|
||||
Ok(Insertion::start_of_file(&program, &locator, &stylist))
|
||||
|
@ -443,7 +442,7 @@ x = 1
|
|||
#[test]
|
||||
fn start_of_block() {
|
||||
fn insert(contents: &str, offset: TextSize) -> Insertion {
|
||||
let tokens: Vec<LexResult> = ruff_python_parser::tokenize(contents, Mode::Module);
|
||||
let tokens = ruff_python_parser::tokenize(contents, Mode::Module);
|
||||
let locator = Locator::new(contents);
|
||||
let stylist = Stylist::from_tokens(&tokens, &locator);
|
||||
Insertion::start_of_block(offset, &locator, &stylist, PySourceType::default())
|
||||
|
|
|
@ -14,7 +14,7 @@ use ruff_python_ast::{PySourceType, Suite};
|
|||
use ruff_python_codegen::Stylist;
|
||||
use ruff_python_index::Indexer;
|
||||
use ruff_python_parser::lexer::LexResult;
|
||||
use ruff_python_parser::{AsMode, ParseError};
|
||||
use ruff_python_parser::{AsMode, ParseError, TokenKindIter, Tokens};
|
||||
use ruff_source_file::{Locator, SourceFileBuilder};
|
||||
use ruff_text_size::Ranged;
|
||||
|
||||
|
@ -353,7 +353,7 @@ pub fn add_noqa_to_path(
|
|||
let contents = source_kind.source_code();
|
||||
|
||||
// Tokenize once.
|
||||
let tokens: Vec<LexResult> = ruff_python_parser::tokenize(contents, source_type.as_mode());
|
||||
let tokens = ruff_python_parser::tokenize(contents, source_type.as_mode());
|
||||
|
||||
// Map row and column locations to byte slices (lazily).
|
||||
let locator = Locator::new(contents);
|
||||
|
@ -518,8 +518,7 @@ pub fn lint_fix<'a>(
|
|||
// Continuously fix until the source code stabilizes.
|
||||
loop {
|
||||
// Tokenize once.
|
||||
let tokens: Vec<LexResult> =
|
||||
ruff_python_parser::tokenize(transformed.source_code(), source_type.as_mode());
|
||||
let tokens = ruff_python_parser::tokenize(transformed.source_code(), source_type.as_mode());
|
||||
|
||||
// Map row and column locations to byte slices (lazily).
|
||||
let locator = Locator::new(transformed.source_code());
|
||||
|
@ -715,7 +714,7 @@ impl<'a> ParseSource<'a> {
|
|||
#[derive(Debug, Clone)]
|
||||
pub enum TokenSource<'a> {
|
||||
/// Use the precomputed tokens to generate the AST.
|
||||
Tokens(Vec<LexResult>),
|
||||
Tokens(Tokens),
|
||||
/// Use the precomputed tokens and AST.
|
||||
Precomputed {
|
||||
tokens: &'a [LexResult],
|
||||
|
@ -723,6 +722,18 @@ pub enum TokenSource<'a> {
|
|||
},
|
||||
}
|
||||
|
||||
impl TokenSource<'_> {
|
||||
/// Returns an iterator over the [`TokenKind`] and the corresponding range.
|
||||
///
|
||||
/// [`TokenKind`]: ruff_python_parser::TokenKind
|
||||
pub fn kinds(&self) -> TokenKindIter {
|
||||
match self {
|
||||
TokenSource::Tokens(tokens) => tokens.kinds(),
|
||||
TokenSource::Precomputed { tokens, .. } => TokenKindIter::new(tokens),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for TokenSource<'_> {
|
||||
type Target = [LexResult];
|
||||
|
||||
|
|
|
@ -11,7 +11,6 @@ mod tests {
|
|||
|
||||
use anyhow::Result;
|
||||
use regex::Regex;
|
||||
use ruff_python_parser::lexer::LexResult;
|
||||
|
||||
use test_case::test_case;
|
||||
|
||||
|
@ -591,7 +590,7 @@ mod tests {
|
|||
let source_type = PySourceType::default();
|
||||
let source_kind = SourceKind::Python(contents.to_string());
|
||||
let settings = LinterSettings::for_rules(Linter::Pyflakes.rules());
|
||||
let tokens: Vec<LexResult> = ruff_python_parser::tokenize(&contents, source_type.as_mode());
|
||||
let tokens = ruff_python_parser::tokenize(&contents, source_type.as_mode());
|
||||
let locator = Locator::new(&contents);
|
||||
let stylist = Stylist::from_tokens(&tokens, &locator);
|
||||
let indexer = Indexer::from_tokens(&tokens, &locator);
|
||||
|
|
|
@ -16,7 +16,6 @@ use ruff_notebook::NotebookError;
|
|||
use ruff_python_ast::PySourceType;
|
||||
use ruff_python_codegen::Stylist;
|
||||
use ruff_python_index::Indexer;
|
||||
use ruff_python_parser::lexer::LexResult;
|
||||
use ruff_python_parser::AsMode;
|
||||
use ruff_python_trivia::textwrap::dedent;
|
||||
use ruff_source_file::{Locator, SourceFileBuilder};
|
||||
|
@ -111,8 +110,7 @@ pub(crate) fn test_contents<'a>(
|
|||
settings: &LinterSettings,
|
||||
) -> (Vec<Message>, Cow<'a, SourceKind>) {
|
||||
let source_type = PySourceType::from(path);
|
||||
let tokens: Vec<LexResult> =
|
||||
ruff_python_parser::tokenize(source_kind.source_code(), source_type.as_mode());
|
||||
let tokens = ruff_python_parser::tokenize(source_kind.source_code(), source_type.as_mode());
|
||||
let locator = Locator::new(source_kind.source_code());
|
||||
let stylist = Stylist::from_tokens(&tokens, &locator);
|
||||
let indexer = Indexer::from_tokens(&tokens, &locator);
|
||||
|
@ -177,7 +175,7 @@ pub(crate) fn test_contents<'a>(
|
|||
|
||||
transformed = Cow::Owned(transformed.updated(fixed_contents, &source_map));
|
||||
|
||||
let tokens: Vec<LexResult> =
|
||||
let tokens =
|
||||
ruff_python_parser::tokenize(transformed.source_code(), source_type.as_mode());
|
||||
let locator = Locator::new(transformed.source_code());
|
||||
let stylist = Stylist::from_tokens(&tokens, &locator);
|
||||
|
|
|
@ -110,6 +110,9 @@
|
|||
//! [parsing]: https://en.wikipedia.org/wiki/Parsing
|
||||
//! [lexer]: crate::lexer
|
||||
|
||||
use std::iter::FusedIterator;
|
||||
use std::ops::Deref;
|
||||
|
||||
use crate::lexer::{lex, lex_starts_at, LexResult};
|
||||
|
||||
pub use crate::error::{FStringErrorType, ParseError, ParseErrorType};
|
||||
|
@ -117,7 +120,7 @@ pub use crate::parser::Program;
|
|||
pub use crate::token::{Tok, TokenKind};
|
||||
|
||||
use ruff_python_ast::{Expr, Mod, ModModule, PySourceType, Suite};
|
||||
use ruff_text_size::TextSize;
|
||||
use ruff_text_size::{Ranged, TextRange, TextSize};
|
||||
|
||||
mod error;
|
||||
pub mod lexer;
|
||||
|
@ -339,8 +342,113 @@ pub fn parse_tokens(tokens: Vec<LexResult>, source: &str, mode: Mode) -> Result<
|
|||
}
|
||||
}
|
||||
|
||||
/// A newtype wrapper around a vector of [`LexResult`].
///
/// This should only include tokens up to and including the first error. This struct is created
/// by the [`tokenize`] function.
#[derive(Debug, Clone)]
pub struct Tokens(Vec<LexResult>);
|
||||
|
||||
impl Tokens {
|
||||
/// Returns an iterator over the [`TokenKind`] and the range corresponding to the tokens.
|
||||
pub fn kinds(&self) -> TokenKindIter {
|
||||
TokenKindIter::new(&self.0)
|
||||
}
|
||||
|
||||
/// Returns an iterator over the [`TokenKind`] and its range for all the tokens that are
|
||||
/// within the given `range`.
|
||||
///
|
||||
/// The start and end position of the given range should correspond to the start position of
|
||||
/// the first token and the end position of the last token in the returned iterator.
|
||||
///
|
||||
/// For example, if the struct contains the following tokens:
|
||||
/// ```txt
|
||||
/// (Def, 0..3)
|
||||
/// (Name, 4..7)
|
||||
/// (Lpar, 7..8)
|
||||
/// (Rpar, 8..9)
|
||||
/// (Colon, 9..10)
|
||||
/// (Ellipsis, 11..14)
|
||||
/// (Newline, 14..14)
|
||||
/// ```
|
||||
///
|
||||
/// Then, the range `4..10` returns an iterator which yields `Name`, `Lpar`, `Rpar`, and
|
||||
/// `Colon` token. But, if the given position doesn't match any of the tokens, an empty
|
||||
/// iterator is returned.
|
||||
pub fn kinds_within_range<T: Ranged>(&self, ranged: T) -> TokenKindIter {
|
||||
let Ok(start_index) = self.binary_search_by_key(&ranged.start(), |result| match result {
|
||||
Ok((_, range)) => range.start(),
|
||||
Err(error) => error.location().start(),
|
||||
}) else {
|
||||
return TokenKindIter::default();
|
||||
};
|
||||
|
||||
let Ok(end_index) = self.binary_search_by_key(&ranged.end(), |result| match result {
|
||||
Ok((_, range)) => range.end(),
|
||||
Err(error) => error.location().end(),
|
||||
}) else {
|
||||
return TokenKindIter::default();
|
||||
};
|
||||
|
||||
TokenKindIter::new(self.get(start_index..=end_index).unwrap_or(&[]))
|
||||
}
|
||||
|
||||
/// Consumes the [`Tokens`], returning the underlying vector of [`LexResult`].
|
||||
pub fn into_inner(self) -> Vec<LexResult> {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for Tokens {
|
||||
type Target = [LexResult];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over the [`TokenKind`] and the corresponding range.
|
||||
///
|
||||
/// This struct is created by the [`Tokens::kinds`] method.
|
||||
#[derive(Clone, Default)]
|
||||
pub struct TokenKindIter<'a> {
|
||||
inner: std::iter::Flatten<std::slice::Iter<'a, LexResult>>,
|
||||
}
|
||||
|
||||
impl<'a> TokenKindIter<'a> {
|
||||
/// Create a new iterator from a slice of [`LexResult`].
|
||||
pub fn new(tokens: &'a [LexResult]) -> Self {
|
||||
Self {
|
||||
inner: tokens.iter().flatten(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the next value without advancing the iterator.
|
||||
pub fn peek(&mut self) -> Option<(TokenKind, TextRange)> {
|
||||
self.clone().next()
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for TokenKindIter<'_> {
|
||||
type Item = (TokenKind, TextRange);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let &(ref tok, range) = self.inner.next()?;
|
||||
Some((TokenKind::from_token(tok), range))
|
||||
}
|
||||
}
|
||||
|
||||
// Marker impl: once `next` has returned `None`, it keeps returning `None`.
impl FusedIterator for TokenKindIter<'_> {}
|
||||
|
||||
impl DoubleEndedIterator for TokenKindIter<'_> {
|
||||
fn next_back(&mut self) -> Option<Self::Item> {
|
||||
let &(ref tok, range) = self.inner.next_back()?;
|
||||
Some((TokenKind::from_token(tok), range))
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect tokens up to and including the first error.
|
||||
pub fn tokenize(contents: &str, mode: Mode) -> Vec<LexResult> {
|
||||
pub fn tokenize(contents: &str, mode: Mode) -> Tokens {
|
||||
let mut tokens: Vec<LexResult> = allocate_tokens_vec(contents);
|
||||
for tok in lexer::lex(contents, mode) {
|
||||
let is_err = tok.is_err();
|
||||
|
@ -350,7 +458,7 @@ pub fn tokenize(contents: &str, mode: Mode) -> Vec<LexResult> {
|
|||
}
|
||||
}
|
||||
|
||||
tokens
|
||||
Tokens(tokens)
|
||||
}
|
||||
|
||||
/// Tokenizes all tokens.
|
||||
|
@ -380,7 +488,7 @@ fn approximate_tokens_lower_bound(contents: &str) -> usize {
|
|||
|
||||
/// Parse a full Python program from its tokens.
|
||||
pub fn parse_program_tokens(
|
||||
tokens: Vec<LexResult>,
|
||||
tokens: Tokens,
|
||||
source: &str,
|
||||
is_jupyter_notebook: bool,
|
||||
) -> anyhow::Result<Suite, ParseError> {
|
||||
|
@ -389,7 +497,7 @@ pub fn parse_program_tokens(
|
|||
} else {
|
||||
Mode::Module
|
||||
};
|
||||
match parse_tokens(tokens, source, mode)? {
|
||||
match parse_tokens(tokens.into_inner(), source, mode)? {
|
||||
Mod::Module(m) => Ok(m.body),
|
||||
Mod::Expression(_) => unreachable!("Mode::Module doesn't return other variant"),
|
||||
}
|
||||
|
|
|
@ -228,6 +228,11 @@ pub enum Tok {
|
|||
}
|
||||
|
||||
impl Tok {
|
||||
/// Returns the [`TokenKind`] corresponding to this token.
#[inline]
pub fn kind(&self) -> TokenKind {
    TokenKind::from_token(self)
}
|
||||
|
||||
pub fn start_marker(mode: Mode) -> Self {
|
||||
match mode {
|
||||
Mode::Module | Mode::Ipython => Tok::StartModule,
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
use ruff_python_index::Indexer;
|
||||
use ruff_python_parser::lexer::LexResult;
|
||||
use ruff_python_parser::{tokenize, Mode};
|
||||
use ruff_source_file::Locator;
|
||||
use ruff_text_size::TextSize;
|
||||
|
@ -38,7 +37,7 @@ fn block_comments_indented_block() {
|
|||
fn block_comments_single_line_is_not_a_block() {
|
||||
// arrange
|
||||
let source = "\n";
|
||||
let tokens: Vec<LexResult> = tokenize(source, Mode::Module);
|
||||
let tokens = tokenize(source, Mode::Module);
|
||||
let locator = Locator::new(source);
|
||||
let indexer = Indexer::from_tokens(&tokens, &locator);
|
||||
|
||||
|
|
|
@ -13,7 +13,6 @@ use ruff_linter::{
|
|||
use ruff_python_ast::PySourceType;
|
||||
use ruff_python_codegen::Stylist;
|
||||
use ruff_python_index::Indexer;
|
||||
use ruff_python_parser::lexer::LexResult;
|
||||
use ruff_python_parser::AsMode;
|
||||
use ruff_source_file::Locator;
|
||||
use ruff_text_size::Ranged;
|
||||
|
@ -76,7 +75,7 @@ pub(crate) fn check(
|
|||
let source_kind = SourceKind::Python(contents.to_string());
|
||||
|
||||
// Tokenize once.
|
||||
let tokens: Vec<LexResult> = ruff_python_parser::tokenize(contents, source_type.as_mode());
|
||||
let tokens = ruff_python_parser::tokenize(contents, source_type.as_mode());
|
||||
|
||||
// Map row and column locations to byte slices (lazily).
|
||||
let locator = Locator::with_index(contents, index);
|
||||
|
|
|
@ -17,7 +17,6 @@ use ruff_python_ast::{Mod, PySourceType};
|
|||
use ruff_python_codegen::Stylist;
|
||||
use ruff_python_formatter::{format_module_ast, pretty_comments, PyFormatContext, QuoteStyle};
|
||||
use ruff_python_index::{CommentRangesBuilder, Indexer};
|
||||
use ruff_python_parser::lexer::LexResult;
|
||||
use ruff_python_parser::{parse_tokens, tokenize_all, AsMode, Mode, Program};
|
||||
use ruff_python_trivia::CommentRanges;
|
||||
use ruff_source_file::{Locator, SourceLocation};
|
||||
|
@ -162,7 +161,7 @@ impl Workspace {
|
|||
let source_kind = SourceKind::Python(contents.to_string());
|
||||
|
||||
// Tokenize once.
|
||||
let tokens: Vec<LexResult> = ruff_python_parser::tokenize(contents, source_type.as_mode());
|
||||
let tokens = ruff_python_parser::tokenize(contents, source_type.as_mode());
|
||||
|
||||
// Map row and column locations to byte slices (lazily).
|
||||
let locator = Locator::new(contents);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue