Use shared Cursor across crates (#5715)

## Summary We have two `Cursor` implementations. This PR moves the implementation from the formatter into `ruff_python_whitespace` (kind of a poorly-named crate now) and uses it for both use-cases.
2025-07-24 05:25:17 +00:00 · 2023-07-12 17:09:27 -04:00 · 2023-07-12 17:09:27 -04:00 · 6dbc6d2e59
commit 6dbc6d2e59
parent 6ce252f0ed
6 changed files with 163 additions and 210 deletions
--- a/crates/ruff/src/checkers/ast/mod.rs
+++ b/crates/ruff/src/checkers/ast/mod.rs
@ -18,7 +18,7 @@ use ruff_python_ast::str::trailing_quote;
 use ruff_python_ast::types::Node;
 use ruff_python_ast::typing::{parse_type_annotation, AnnotationKind};
 use ruff_python_ast::visitor::{walk_except_handler, walk_pattern, Visitor};
-use ruff_python_ast::{cast, helpers, identifier, str, visitor};
+use ruff_python_ast::{cast, helpers, str, visitor};
 use ruff_python_semantic::analyze::{branch_detection, typing, visibility};
 use ruff_python_semantic::{
    Binding, BindingFlags, BindingId, BindingKind, ContextualizedDefinition, Exceptions,
@ -251,9 +251,8 @@ where
        // Pre-visit.
        match stmt {
            Stmt::Global(ast::StmtGlobal { names, range: _ }) => {
-                let ranges: Vec<TextRange> = identifier::names(stmt, self.locator).collect();
                if !self.semantic.scope_id.is_global() {
-                    for (name, range) in names.iter().zip(ranges.iter()) {
+                    for name in names {
                        if let Some(binding_id) = self.semantic.global_scope().get(name) {
                            // Mark the binding in the global scope as "rebound" in the current scope.
                            self.semantic
@ -262,7 +261,7 @@ where

                        // Add a binding to the current scope.
                        let binding_id = self.semantic.push_binding(
-                            *range,
+                            name.range(),
                            BindingKind::Global,
                            BindingFlags::GLOBAL,
                        );
@ -272,21 +271,19 @@ where
                }

                if self.enabled(Rule::AmbiguousVariableName) {
-                    self.diagnostics
-                        .extend(names.iter().zip(ranges.iter()).filter_map(|(name, range)| {
-                            pycodestyle::rules::ambiguous_variable_name(name, *range)
+                    self.diagnostics.extend(names.iter().filter_map(|name| {
+                        pycodestyle::rules::ambiguous_variable_name(name, name.range())
                    }));
                }
            }
            Stmt::Nonlocal(ast::StmtNonlocal { names, range: _ }) => {
-                let ranges: Vec<TextRange> = identifier::names(stmt, self.locator).collect();
                if !self.semantic.scope_id.is_global() {
-                    for (name, range) in names.iter().zip(ranges.iter()) {
+                    for name in names {
                        if let Some((scope_id, binding_id)) = self.semantic.nonlocal(name) {
                            // Mark the binding as "used".
                            self.semantic.add_local_reference(
                                binding_id,
-                                *range,
+                                name.range(),
                                ExecutionContext::Runtime,
                            );

@ -297,7 +294,7 @@ where

                            // Add a binding to the current scope.
                            let binding_id = self.semantic.push_binding(
-                                *range,
+                                name.range(),
                                BindingKind::Nonlocal(scope_id),
                                BindingFlags::NONLOCAL,
                            );
@ -309,16 +306,15 @@ where
                                    pylint::rules::NonlocalWithoutBinding {
                                        name: name.to_string(),
                                    },
-                                    *range,
+                                    name.range(),
                                ));
                            }
                        }
                    }
                }
                if self.enabled(Rule::AmbiguousVariableName) {
-                    self.diagnostics
-                        .extend(names.iter().zip(ranges.iter()).filter_map(|(name, range)| {
-                            pycodestyle::rules::ambiguous_variable_name(name, *range)
+                    self.diagnostics.extend(names.iter().filter_map(|name| {
+                        pycodestyle::rules::ambiguous_variable_name(name, name.range())
                    }));
                }
            }
--- a/crates/ruff_python_ast/src/identifier.rs
+++ b/crates/ruff_python_ast/src/identifier.rs
@ -1,29 +1,20 @@
 //! Extract [`TextRange`] information from AST nodes.
 //!
-//! In the `RustPython` AST, each node has a `range` field that contains the
-//! start and end byte offsets of the node. However, attributes on those
-//! nodes may not have their own ranges. In particular, identifiers are
-//! not given their own ranges, unless they're part of a name expression.
-//!
 //! For example, given:
 //! ```python
-//! def f():
+//! try:
+//!     ...
+//! except Exception as e:
 //!     ...
 //! ```
 //!
-//! The statement defining `f` has a range, but the identifier `f` does not.
-//!
-//! This module assists with extracting [`TextRange`] ranges from AST nodes
-//! via manual lexical analysis.
-
-use std::ops::{Add, Sub};
-use std::str::Chars;
+//! This module can be used to identify the [`TextRange`] of the `except` token.

 use ruff_text_size::{TextLen, TextRange, TextSize};
 use rustpython_ast::{Alias, Arg, ArgWithDefault, Pattern};
 use rustpython_parser::ast::{self, ExceptHandler, Ranged, Stmt};

-use ruff_python_whitespace::is_python_whitespace;
+use ruff_python_whitespace::{is_python_whitespace, Cursor};

 use crate::source_code::Locator;

@ -163,20 +154,6 @@ impl TryIdentifier for ExceptHandler {
    }
 }

-/// Return the [`TextRange`] for every name in a [`Stmt`].
-///
-/// Intended to be used for `global` and `nonlocal` statements.
-///
-/// For example, return the ranges of `x` and `y` in:
-/// ```python
-/// global x, y
-/// ```
-pub fn names<'a>(stmt: &Stmt, locator: &'a Locator<'a>) -> impl Iterator<Item = TextRange> + 'a {
-    // Given `global x, y`, the first identifier is `global`, and the remaining identifiers are
-    // the names.
-    IdentifierTokenizer::new(locator.contents(), stmt.range()).skip(1)
-}
-
 /// Return the [`TextRange`] of the `except` token in an [`ExceptHandler`].
 pub fn except(handler: &ExceptHandler, locator: &Locator) -> TextRange {
    IdentifierTokenizer::new(locator.contents(), handler.range())
@ -248,13 +225,15 @@ impl<'a> IdentifierTokenizer<'a> {
    }

    fn next_token(&mut self) -> Option<TextRange> {
-        while let Some(c) = self.cursor.bump() {
+        while let Some(c) = {
+            self.offset += self.cursor.token_len();
+            self.cursor.start_token();
+            self.cursor.bump()
+        } {
            match c {
                c if is_python_identifier_start(c) => {
-                    let start = self.offset.add(self.cursor.offset()).sub(c.text_len());
                    self.cursor.eat_while(is_python_identifier_continue);
-                    let end = self.offset.add(self.cursor.offset());
-                    return Some(TextRange::new(start, end));
+                    return Some(TextRange::at(self.offset, self.cursor.token_len()));
                }

                c if is_python_whitespace(c) => {
@ -295,73 +274,15 @@ impl Iterator for IdentifierTokenizer<'_> {
    }
 }

-const EOF_CHAR: char = '\0';
-
-#[derive(Debug, Clone)]
-struct Cursor<'a> {
-    chars: Chars<'a>,
-    offset: TextSize,
-}
-
-impl<'a> Cursor<'a> {
-    fn new(source: &'a str) -> Self {
-        Self {
-            chars: source.chars(),
-            offset: TextSize::from(0),
-        }
-    }
-
-    const fn offset(&self) -> TextSize {
-        self.offset
-    }
-
-    /// Peeks the next character from the input stream without consuming it.
-    /// Returns [`EOF_CHAR`] if the file is at the end of the file.
-    fn first(&self) -> char {
-        self.chars.clone().next().unwrap_or(EOF_CHAR)
-    }
-
-    /// Returns `true` if the file is at the end of the file.
-    fn is_eof(&self) -> bool {
-        self.chars.as_str().is_empty()
-    }
-
-    /// Consumes the next character.
-    fn bump(&mut self) -> Option<char> {
-        if let Some(char) = self.chars.next() {
-            self.offset += char.text_len();
-            Some(char)
-        } else {
-            None
-        }
-    }
-
-    /// Eats the next character if it matches the given character.
-    fn eat_char(&mut self, c: char) -> bool {
-        if self.first() == c {
-            self.bump();
-            true
-        } else {
-            false
-        }
-    }
-
-    /// Eats symbols while predicate returns true or until the end of file is reached.
-    fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
-        while predicate(self.first()) && !self.is_eof() {
-            self.bump();
-        }
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use anyhow::Result;
    use ruff_text_size::{TextRange, TextSize};
-    use rustpython_ast::Stmt;
+    use rustpython_ast::{Ranged, Stmt};
    use rustpython_parser::Parse;

    use crate::identifier;
+    use crate::identifier::IdentifierTokenizer;
    use crate::source_code::Locator;

    #[test]
@ -383,4 +304,33 @@ else:
        );
        Ok(())
    }
+
+    #[test]
+    fn extract_global_names() -> Result<()> {
+        let contents = r#"global X,Y, Z"#.trim();
+        let stmt = Stmt::parse(contents, "<filename>")?;
+        let locator = Locator::new(contents);
+
+        let mut names = IdentifierTokenizer::new(locator.contents(), stmt.range());
+
+        let range = names.next_token().unwrap();
+        assert_eq!(&contents[range], "global");
+        assert_eq!(range, TextRange::new(TextSize::from(0), TextSize::from(6)));
+
+        let range = names.next_token().unwrap();
+        assert_eq!(&contents[range], "X");
+        assert_eq!(range, TextRange::new(TextSize::from(7), TextSize::from(8)));
+
+        let range = names.next_token().unwrap();
+        assert_eq!(&contents[range], "Y");
+        assert_eq!(range, TextRange::new(TextSize::from(9), TextSize::from(10)));
+
+        let range = names.next_token().unwrap();
+        assert_eq!(&contents[range], "Z");
+        assert_eq!(
+            range,
+            TextRange::new(TextSize::from(12), TextSize::from(13))
+        );
+        Ok(())
+    }
 }
--- a/crates/ruff_python_formatter/Cargo.toml
+++ b/crates/ruff_python_formatter/Cargo.toml
@ -44,7 +44,6 @@ path = "tests/fixtures.rs"
 test = true
 required-features = [ "serde" ]

-
 [features]
 serde = ["dep:serde", "ruff_formatter/serde"]
 default = ["serde"]
--- a/crates/ruff_python_formatter/src/trivia.rs
+++ b/crates/ruff_python_formatter/src/trivia.rs
@ -1,9 +1,8 @@
-use std::str::Chars;
-
-use ruff_python_whitespace::is_python_whitespace;
 use ruff_text_size::{TextLen, TextRange, TextSize};
 use unic_ucd_ident::{is_xid_continue, is_xid_start};

+use ruff_python_whitespace::{is_python_whitespace, Cursor};
+
 /// Searches for the first non-trivia character in `range`.
 ///
 /// The search skips over any whitespace and comments.
@ -402,9 +401,7 @@ impl<'a> SimpleTokenizer<'a> {

                // Skip the test whether there's a preceding comment if it has been performed before.
                if !self.back_line_has_no_comment {
-                    let rest = self.cursor.chars.as_str();
-
-                    for (back_index, c) in rest.chars().rev().enumerate() {
+                    for (back_index, c) in self.cursor.chars().rev().enumerate() {
                        match c {
                            '#' => {
                                // Potentially a comment
@ -515,100 +512,6 @@ impl DoubleEndedIterator for SimpleTokenizer<'_> {
    }
 }

-const EOF_CHAR: char = '\0';
-
-#[derive(Debug, Clone)]
-struct Cursor<'a> {
-    chars: Chars<'a>,
-    source_length: TextSize,
-}
-
-impl<'a> Cursor<'a> {
-    fn new(source: &'a str) -> Self {
-        Self {
-            source_length: source.text_len(),
-            chars: source.chars(),
-        }
-    }
-
-    /// Peeks the next character from the input stream without consuming it.
-    /// Returns [`EOF_CHAR`] if the file is at the end of the file.
-    fn first(&self) -> char {
-        self.chars.clone().next().unwrap_or(EOF_CHAR)
-    }
-
-    /// Peeks the next character from the input stream without consuming it.
-    /// Returns [`EOF_CHAR`] if the file is at the end of the file.
-    fn last(&self) -> char {
-        self.chars.clone().next_back().unwrap_or(EOF_CHAR)
-    }
-
-    // SAFETY: THe `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
-    #[allow(clippy::cast_possible_truncation)]
-    fn text_len(&self) -> TextSize {
-        TextSize::new(self.chars.as_str().len() as u32)
-    }
-
-    fn token_len(&self) -> TextSize {
-        self.source_length - self.text_len()
-    }
-
-    fn start_token(&mut self) {
-        self.source_length = self.text_len();
-    }
-
-    /// Returns `true` if the file is at the end of the file.
-    fn is_eof(&self) -> bool {
-        self.chars.as_str().is_empty()
-    }
-
-    /// Consumes the next character
-    fn bump(&mut self) -> Option<char> {
-        self.chars.next()
-    }
-
-    /// Consumes the next character from the back
-    fn bump_back(&mut self) -> Option<char> {
-        self.chars.next_back()
-    }
-
-    fn eat_char(&mut self, c: char) -> bool {
-        if self.first() == c {
-            self.bump();
-            true
-        } else {
-            false
-        }
-    }
-
-    fn eat_char_back(&mut self, c: char) -> bool {
-        if self.last() == c {
-            self.bump_back();
-            true
-        } else {
-            false
-        }
-    }
-
-    /// Eats symbols while predicate returns true or until the end of file is reached.
-    fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
-        // It was tried making optimized version of this for eg. line comments, but
-        // LLVM can inline all of this and compile it down to fast iteration over bytes.
-        while predicate(self.first()) && !self.is_eof() {
-            self.bump();
-        }
-    }
-
-    /// Eats symbols from the back while predicate returns true or until the beginning of file is reached.
-    fn eat_back_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
-        // It was tried making optimized version of this for eg. line comments, but
-        // LLVM can inline all of this and compile it down to fast iteration over bytes.
-        while predicate(self.last()) && !self.is_eof() {
-            self.bump_back();
-        }
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use insta::assert_debug_snapshot;
--- a/crates/ruff_python_whitespace/src/cursor.rs
+++ b/crates/ruff_python_whitespace/src/cursor.rs
@ -0,0 +1,103 @@
+use std::str::Chars;
+
+use ruff_text_size::{TextLen, TextSize};
+
+pub const EOF_CHAR: char = '\0';
+
+/// A [`Cursor`] over a string.
+#[derive(Debug, Clone)]
+pub struct Cursor<'a> {
+    chars: Chars<'a>,
+    source_length: TextSize,
+}
+
+impl<'a> Cursor<'a> {
+    pub fn new(source: &'a str) -> Self {
+        Self {
+            source_length: source.text_len(),
+            chars: source.chars(),
+        }
+    }
+
+    /// Return the remaining input as a string slice.
+    pub fn chars(&self) -> Chars<'a> {
+        self.chars.clone()
+    }
+
+    /// Peeks the next character from the input stream without consuming it.
+    /// Returns [`EOF_CHAR`] if the file is at the end of the file.
+    pub fn first(&self) -> char {
+        self.chars.clone().next().unwrap_or(EOF_CHAR)
+    }
+
+    /// Peeks the next character from the input stream without consuming it.
+    /// Returns [`EOF_CHAR`] if the file is at the end of the file.
+    pub fn last(&self) -> char {
+        self.chars.clone().next_back().unwrap_or(EOF_CHAR)
+    }
+
+    // SAFETY: THe `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
+    #[allow(clippy::cast_possible_truncation)]
+    pub fn text_len(&self) -> TextSize {
+        TextSize::new(self.chars.as_str().len() as u32)
+    }
+
+    pub fn token_len(&self) -> TextSize {
+        self.source_length - self.text_len()
+    }
+
+    pub fn start_token(&mut self) {
+        self.source_length = self.text_len();
+    }
+
+    /// Returns `true` if the file is at the end of the file.
+    pub fn is_eof(&self) -> bool {
+        self.chars.as_str().is_empty()
+    }
+
+    /// Consumes the next character
+    pub fn bump(&mut self) -> Option<char> {
+        self.chars.next()
+    }
+
+    /// Consumes the next character from the back
+    pub fn bump_back(&mut self) -> Option<char> {
+        self.chars.next_back()
+    }
+
+    pub fn eat_char(&mut self, c: char) -> bool {
+        if self.first() == c {
+            self.bump();
+            true
+        } else {
+            false
+        }
+    }
+
+    pub fn eat_char_back(&mut self, c: char) -> bool {
+        if self.last() == c {
+            self.bump_back();
+            true
+        } else {
+            false
+        }
+    }
+
+    /// Eats symbols while predicate returns true or until the end of file is reached.
+    pub fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
+        // It was tried making optimized version of this for eg. line comments, but
+        // LLVM can inline all of this and compile it down to fast iteration over bytes.
+        while predicate(self.first()) && !self.is_eof() {
+            self.bump();
+        }
+    }
+
+    /// Eats symbols from the back while predicate returns true or until the beginning of file is reached.
+    pub fn eat_back_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
+        // It was tried making optimized version of this for eg. line comments, but
+        // LLVM can inline all of this and compile it down to fast iteration over bytes.
+        while predicate(self.last()) && !self.is_eof() {
+            self.bump_back();
+        }
+    }
+}
--- a/crates/ruff_python_whitespace/src/lib.rs
+++ b/crates/ruff_python_whitespace/src/lib.rs
@ -1,5 +1,7 @@
+mod cursor;
 mod newlines;
 mod whitespace;

+pub use cursor::*;
 pub use newlines::*;
 pub use whitespace::*;