diff --git a/crates/ruff/src/checkers/ast/mod.rs b/crates/ruff/src/checkers/ast/mod.rs index aff756000b..9fac7dde22 100644 --- a/crates/ruff/src/checkers/ast/mod.rs +++ b/crates/ruff/src/checkers/ast/mod.rs @@ -18,7 +18,7 @@ use ruff_python_ast::str::trailing_quote; use ruff_python_ast::types::Node; use ruff_python_ast::typing::{parse_type_annotation, AnnotationKind}; use ruff_python_ast::visitor::{walk_except_handler, walk_pattern, Visitor}; -use ruff_python_ast::{cast, helpers, identifier, str, visitor}; +use ruff_python_ast::{cast, helpers, str, visitor}; use ruff_python_semantic::analyze::{branch_detection, typing, visibility}; use ruff_python_semantic::{ Binding, BindingFlags, BindingId, BindingKind, ContextualizedDefinition, Exceptions, @@ -251,9 +251,8 @@ where // Pre-visit. match stmt { Stmt::Global(ast::StmtGlobal { names, range: _ }) => { - let ranges: Vec = identifier::names(stmt, self.locator).collect(); if !self.semantic.scope_id.is_global() { - for (name, range) in names.iter().zip(ranges.iter()) { + for name in names { if let Some(binding_id) = self.semantic.global_scope().get(name) { // Mark the binding in the global scope as "rebound" in the current scope. self.semantic @@ -262,7 +261,7 @@ where // Add a binding to the current scope. let binding_id = self.semantic.push_binding( - *range, + name.range(), BindingKind::Global, BindingFlags::GLOBAL, ); @@ -272,21 +271,19 @@ where } if self.enabled(Rule::AmbiguousVariableName) { - self.diagnostics - .extend(names.iter().zip(ranges.iter()).filter_map(|(name, range)| { - pycodestyle::rules::ambiguous_variable_name(name, *range) - })); + self.diagnostics.extend(names.iter().filter_map(|name| { + pycodestyle::rules::ambiguous_variable_name(name, name.range()) + })); } } Stmt::Nonlocal(ast::StmtNonlocal { names, range: _ }) => { - let ranges: Vec = identifier::names(stmt, self.locator).collect(); if !self.semantic.scope_id.is_global() { - for (name, range) in names.iter().zip(ranges.iter()) { + for name in names { if let Some((scope_id, binding_id)) = self.semantic.nonlocal(name) { // Mark the binding as "used". self.semantic.add_local_reference( binding_id, - *range, + name.range(), ExecutionContext::Runtime, ); @@ -297,7 +294,7 @@ where // Add a binding to the current scope. let binding_id = self.semantic.push_binding( - *range, + name.range(), BindingKind::Nonlocal(scope_id), BindingFlags::NONLOCAL, ); @@ -309,17 +306,16 @@ where pylint::rules::NonlocalWithoutBinding { name: name.to_string(), }, - *range, + name.range(), )); } } } } if self.enabled(Rule::AmbiguousVariableName) { - self.diagnostics - .extend(names.iter().zip(ranges.iter()).filter_map(|(name, range)| { - pycodestyle::rules::ambiguous_variable_name(name, *range) - })); + self.diagnostics.extend(names.iter().filter_map(|name| { + pycodestyle::rules::ambiguous_variable_name(name, name.range()) + })); } } Stmt::Break(_) => { diff --git a/crates/ruff_python_ast/src/identifier.rs b/crates/ruff_python_ast/src/identifier.rs index d8ddc8606c..fb801f098e 100644 --- a/crates/ruff_python_ast/src/identifier.rs +++ b/crates/ruff_python_ast/src/identifier.rs @@ -1,29 +1,20 @@ //! Extract [`TextRange`] information from AST nodes. //! -//! In the `RustPython` AST, each node has a `range` field that contains the -//! start and end byte offsets of the node. However, attributes on those -//! nodes may not have their own ranges. In particular, identifiers are -//! not given their own ranges, unless they're part of a name expression. -//! //! For example, given: //! ```python -//! def f(): +//! try: +//! ... +//! except Exception as e: //! ... //! ``` //! -//! The statement defining `f` has a range, but the identifier `f` does not. -//! -//! This module assists with extracting [`TextRange`] ranges from AST nodes -//! via manual lexical analysis. - -use std::ops::{Add, Sub}; -use std::str::Chars; +//! This module can be used to identify the [`TextRange`] of the `except` token. use ruff_text_size::{TextLen, TextRange, TextSize}; use rustpython_ast::{Alias, Arg, ArgWithDefault, Pattern}; use rustpython_parser::ast::{self, ExceptHandler, Ranged, Stmt}; -use ruff_python_whitespace::is_python_whitespace; +use ruff_python_whitespace::{is_python_whitespace, Cursor}; use crate::source_code::Locator; @@ -163,20 +154,6 @@ impl TryIdentifier for ExceptHandler { } } -/// Return the [`TextRange`] for every name in a [`Stmt`]. -/// -/// Intended to be used for `global` and `nonlocal` statements. -/// -/// For example, return the ranges of `x` and `y` in: -/// ```python -/// global x, y -/// ``` -pub fn names<'a>(stmt: &Stmt, locator: &'a Locator<'a>) -> impl Iterator + 'a { - // Given `global x, y`, the first identifier is `global`, and the remaining identifiers are - // the names. - IdentifierTokenizer::new(locator.contents(), stmt.range()).skip(1) -} - /// Return the [`TextRange`] of the `except` token in an [`ExceptHandler`]. pub fn except(handler: &ExceptHandler, locator: &Locator) -> TextRange { IdentifierTokenizer::new(locator.contents(), handler.range()) @@ -248,13 +225,15 @@ impl<'a> IdentifierTokenizer<'a> { } fn next_token(&mut self) -> Option { - while let Some(c) = self.cursor.bump() { + while let Some(c) = { + self.offset += self.cursor.token_len(); + self.cursor.start_token(); + self.cursor.bump() + } { match c { c if is_python_identifier_start(c) => { - let start = self.offset.add(self.cursor.offset()).sub(c.text_len()); self.cursor.eat_while(is_python_identifier_continue); - let end = self.offset.add(self.cursor.offset()); - return Some(TextRange::new(start, end)); + return Some(TextRange::at(self.offset, self.cursor.token_len())); } c if is_python_whitespace(c) => { @@ -295,73 +274,15 @@ impl Iterator for IdentifierTokenizer<'_> { } } -const EOF_CHAR: char = '\0'; - -#[derive(Debug, Clone)] -struct Cursor<'a> { - chars: Chars<'a>, - offset: TextSize, -} - -impl<'a> Cursor<'a> { - fn new(source: &'a str) -> Self { - Self { - chars: source.chars(), - offset: TextSize::from(0), - } - } - - const fn offset(&self) -> TextSize { - self.offset - } - - /// Peeks the next character from the input stream without consuming it. - /// Returns [`EOF_CHAR`] if the file is at the end of the file. - fn first(&self) -> char { - self.chars.clone().next().unwrap_or(EOF_CHAR) - } - - /// Returns `true` if the file is at the end of the file. - fn is_eof(&self) -> bool { - self.chars.as_str().is_empty() - } - - /// Consumes the next character. - fn bump(&mut self) -> Option { - if let Some(char) = self.chars.next() { - self.offset += char.text_len(); - Some(char) - } else { - None - } - } - - /// Eats the next character if it matches the given character. - fn eat_char(&mut self, c: char) -> bool { - if self.first() == c { - self.bump(); - true - } else { - false - } - } - - /// Eats symbols while predicate returns true or until the end of file is reached. - fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { - while predicate(self.first()) && !self.is_eof() { - self.bump(); - } - } -} - #[cfg(test)] mod tests { use anyhow::Result; use ruff_text_size::{TextRange, TextSize}; - use rustpython_ast::Stmt; + use rustpython_ast::{Ranged, Stmt}; use rustpython_parser::Parse; use crate::identifier; + use crate::identifier::IdentifierTokenizer; use crate::source_code::Locator; #[test] @@ -383,4 +304,33 @@ else: ); Ok(()) } + + #[test] + fn extract_global_names() -> Result<()> { + let contents = r#"global X,Y, Z"#.trim(); + let stmt = Stmt::parse(contents, "")?; + let locator = Locator::new(contents); + + let mut names = IdentifierTokenizer::new(locator.contents(), stmt.range()); + + let range = names.next_token().unwrap(); + assert_eq!(&contents[range], "global"); + assert_eq!(range, TextRange::new(TextSize::from(0), TextSize::from(6))); + + let range = names.next_token().unwrap(); + assert_eq!(&contents[range], "X"); + assert_eq!(range, TextRange::new(TextSize::from(7), TextSize::from(8))); + + let range = names.next_token().unwrap(); + assert_eq!(&contents[range], "Y"); + assert_eq!(range, TextRange::new(TextSize::from(9), TextSize::from(10))); + + let range = names.next_token().unwrap(); + assert_eq!(&contents[range], "Z"); + assert_eq!( + range, + TextRange::new(TextSize::from(12), TextSize::from(13)) + ); + Ok(()) + } } diff --git a/crates/ruff_python_formatter/Cargo.toml b/crates/ruff_python_formatter/Cargo.toml index dac4a95d80..381c3ec6c9 100644 --- a/crates/ruff_python_formatter/Cargo.toml +++ b/crates/ruff_python_formatter/Cargo.toml @@ -44,7 +44,6 @@ path = "tests/fixtures.rs" test = true required-features = [ "serde" ] - [features] serde = ["dep:serde", "ruff_formatter/serde"] default = ["serde"] diff --git a/crates/ruff_python_formatter/src/trivia.rs b/crates/ruff_python_formatter/src/trivia.rs index 4d8a1fa8cd..b6432e5336 100644 --- a/crates/ruff_python_formatter/src/trivia.rs +++ b/crates/ruff_python_formatter/src/trivia.rs @@ -1,9 +1,8 @@ -use std::str::Chars; - -use ruff_python_whitespace::is_python_whitespace; use ruff_text_size::{TextLen, TextRange, TextSize}; use unic_ucd_ident::{is_xid_continue, is_xid_start}; +use ruff_python_whitespace::{is_python_whitespace, Cursor}; + /// Searches for the first non-trivia character in `range`. /// /// The search skips over any whitespace and comments. @@ -402,9 +401,7 @@ impl<'a> SimpleTokenizer<'a> { // Skip the test whether there's a preceding comment if it has been performed before. if !self.back_line_has_no_comment { - let rest = self.cursor.chars.as_str(); - - for (back_index, c) in rest.chars().rev().enumerate() { + for (back_index, c) in self.cursor.chars().rev().enumerate() { match c { '#' => { // Potentially a comment @@ -515,100 +512,6 @@ impl DoubleEndedIterator for SimpleTokenizer<'_> { } } -const EOF_CHAR: char = '\0'; - -#[derive(Debug, Clone)] -struct Cursor<'a> { - chars: Chars<'a>, - source_length: TextSize, -} - -impl<'a> Cursor<'a> { - fn new(source: &'a str) -> Self { - Self { - source_length: source.text_len(), - chars: source.chars(), - } - } - - /// Peeks the next character from the input stream without consuming it. - /// Returns [`EOF_CHAR`] if the file is at the end of the file. - fn first(&self) -> char { - self.chars.clone().next().unwrap_or(EOF_CHAR) - } - - /// Peeks the next character from the input stream without consuming it. - /// Returns [`EOF_CHAR`] if the file is at the end of the file. - fn last(&self) -> char { - self.chars.clone().next_back().unwrap_or(EOF_CHAR) - } - - // SAFETY: THe `source.text_len` call in `new` would panic if the string length is larger than a `u32`. - #[allow(clippy::cast_possible_truncation)] - fn text_len(&self) -> TextSize { - TextSize::new(self.chars.as_str().len() as u32) - } - - fn token_len(&self) -> TextSize { - self.source_length - self.text_len() - } - - fn start_token(&mut self) { - self.source_length = self.text_len(); - } - - /// Returns `true` if the file is at the end of the file. - fn is_eof(&self) -> bool { - self.chars.as_str().is_empty() - } - - /// Consumes the next character - fn bump(&mut self) -> Option { - self.chars.next() - } - - /// Consumes the next character from the back - fn bump_back(&mut self) -> Option { - self.chars.next_back() - } - - fn eat_char(&mut self, c: char) -> bool { - if self.first() == c { - self.bump(); - true - } else { - false - } - } - - fn eat_char_back(&mut self, c: char) -> bool { - if self.last() == c { - self.bump_back(); - true - } else { - false - } - } - - /// Eats symbols while predicate returns true or until the end of file is reached. - fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { - // It was tried making optimized version of this for eg. line comments, but - // LLVM can inline all of this and compile it down to fast iteration over bytes. - while predicate(self.first()) && !self.is_eof() { - self.bump(); - } - } - - /// Eats symbols from the back while predicate returns true or until the beginning of file is reached. - fn eat_back_while(&mut self, mut predicate: impl FnMut(char) -> bool) { - // It was tried making optimized version of this for eg. line comments, but - // LLVM can inline all of this and compile it down to fast iteration over bytes. - while predicate(self.last()) && !self.is_eof() { - self.bump_back(); - } - } -} - #[cfg(test)] mod tests { use insta::assert_debug_snapshot; diff --git a/crates/ruff_python_whitespace/src/cursor.rs b/crates/ruff_python_whitespace/src/cursor.rs new file mode 100644 index 0000000000..43a750cb4f --- /dev/null +++ b/crates/ruff_python_whitespace/src/cursor.rs @@ -0,0 +1,103 @@ +use std::str::Chars; + +use ruff_text_size::{TextLen, TextSize}; + +pub const EOF_CHAR: char = '\0'; + +/// A [`Cursor`] over a string. +#[derive(Debug, Clone)] +pub struct Cursor<'a> { + chars: Chars<'a>, + source_length: TextSize, +} + +impl<'a> Cursor<'a> { + pub fn new(source: &'a str) -> Self { + Self { + source_length: source.text_len(), + chars: source.chars(), + } + } + + /// Return the remaining input as a string slice. + pub fn chars(&self) -> Chars<'a> { + self.chars.clone() + } + + /// Peeks the next character from the input stream without consuming it. + /// Returns [`EOF_CHAR`] if the file is at the end of the file. + pub fn first(&self) -> char { + self.chars.clone().next().unwrap_or(EOF_CHAR) + } + + /// Peeks the next character from the input stream without consuming it. + /// Returns [`EOF_CHAR`] if the file is at the end of the file. + pub fn last(&self) -> char { + self.chars.clone().next_back().unwrap_or(EOF_CHAR) + } + + // SAFETY: THe `source.text_len` call in `new` would panic if the string length is larger than a `u32`. + #[allow(clippy::cast_possible_truncation)] + pub fn text_len(&self) -> TextSize { + TextSize::new(self.chars.as_str().len() as u32) + } + + pub fn token_len(&self) -> TextSize { + self.source_length - self.text_len() + } + + pub fn start_token(&mut self) { + self.source_length = self.text_len(); + } + + /// Returns `true` if the file is at the end of the file. + pub fn is_eof(&self) -> bool { + self.chars.as_str().is_empty() + } + + /// Consumes the next character + pub fn bump(&mut self) -> Option { + self.chars.next() + } + + /// Consumes the next character from the back + pub fn bump_back(&mut self) -> Option { + self.chars.next_back() + } + + pub fn eat_char(&mut self, c: char) -> bool { + if self.first() == c { + self.bump(); + true + } else { + false + } + } + + pub fn eat_char_back(&mut self, c: char) -> bool { + if self.last() == c { + self.bump_back(); + true + } else { + false + } + } + + /// Eats symbols while predicate returns true or until the end of file is reached. + pub fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { + // It was tried making optimized version of this for eg. line comments, but + // LLVM can inline all of this and compile it down to fast iteration over bytes. + while predicate(self.first()) && !self.is_eof() { + self.bump(); + } + } + + /// Eats symbols from the back while predicate returns true or until the beginning of file is reached. + pub fn eat_back_while(&mut self, mut predicate: impl FnMut(char) -> bool) { + // It was tried making optimized version of this for eg. line comments, but + // LLVM can inline all of this and compile it down to fast iteration over bytes. + while predicate(self.last()) && !self.is_eof() { + self.bump_back(); + } + } +} diff --git a/crates/ruff_python_whitespace/src/lib.rs b/crates/ruff_python_whitespace/src/lib.rs index 36d3ddee97..b8c95e351c 100644 --- a/crates/ruff_python_whitespace/src/lib.rs +++ b/crates/ruff_python_whitespace/src/lib.rs @@ -1,5 +1,7 @@ +mod cursor; mod newlines; mod whitespace; +pub use cursor::*; pub use newlines::*; pub use whitespace::*;