# Use shared Cursor across crates (#5715)
## Summary

We have two `Cursor` implementations. This PR moves the implementation from the formatter into `ruff_python_whitespace` (kind of a poorly-named crate now) and uses it for both use cases.
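For orientation, the shared `Cursor` is a thin wrapper over `std::str::Chars` that measures tokens with `start_token`/`token_len` (see the new `cursor.rs` at the end of this diff). Below is a minimal sketch of the scanning pattern both call sites rely on; the word-splitting rule itself is invented for the demo and is not code from this PR:

```rust
use ruff_python_whitespace::Cursor;
use ruff_text_size::{TextRange, TextSize};

/// Collect the range of every ASCII-alphabetic "word" in `source`.
/// `Cursor::new`, `start_token`, `bump`, `eat_while`, and `token_len` are the
/// shared API from this PR; the word-splitting logic is illustrative only.
fn word_ranges(source: &str) -> Vec<TextRange> {
    let mut cursor = Cursor::new(source);
    let mut offset = TextSize::default();
    let mut ranges = Vec::new();

    while let Some(c) = {
        // Account for whatever the previous iteration consumed, then start
        // measuring a new token.
        offset += cursor.token_len();
        cursor.start_token();
        cursor.bump()
    } {
        if c.is_ascii_alphabetic() {
            // `token_len` covers `c` plus everything eaten here.
            cursor.eat_while(|c| c.is_ascii_alphabetic());
            ranges.push(TextRange::at(offset, cursor.token_len()));
        }
    }

    ranges
}
```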
parent 6ce252f0ed · commit 6dbc6d2e59

6 changed files with 163 additions and 210 deletions
```diff
@@ -18,7 +18,7 @@ use ruff_python_ast::str::trailing_quote;
 use ruff_python_ast::types::Node;
 use ruff_python_ast::typing::{parse_type_annotation, AnnotationKind};
 use ruff_python_ast::visitor::{walk_except_handler, walk_pattern, Visitor};
-use ruff_python_ast::{cast, helpers, identifier, str, visitor};
+use ruff_python_ast::{cast, helpers, str, visitor};
 use ruff_python_semantic::analyze::{branch_detection, typing, visibility};
 use ruff_python_semantic::{
     Binding, BindingFlags, BindingId, BindingKind, ContextualizedDefinition, Exceptions,
```
```diff
@@ -251,9 +251,8 @@ where
         // Pre-visit.
         match stmt {
             Stmt::Global(ast::StmtGlobal { names, range: _ }) => {
-                let ranges: Vec<TextRange> = identifier::names(stmt, self.locator).collect();
                 if !self.semantic.scope_id.is_global() {
-                    for (name, range) in names.iter().zip(ranges.iter()) {
+                    for name in names {
                         if let Some(binding_id) = self.semantic.global_scope().get(name) {
                             // Mark the binding in the global scope as "rebound" in the current scope.
                             self.semantic
```
```diff
@@ -262,7 +261,7 @@ where
 
                         // Add a binding to the current scope.
                         let binding_id = self.semantic.push_binding(
-                            *range,
+                            name.range(),
                             BindingKind::Global,
                             BindingFlags::GLOBAL,
                         );
```
```diff
@@ -272,21 +271,19 @@ where
                 }
 
                 if self.enabled(Rule::AmbiguousVariableName) {
-                    self.diagnostics
-                        .extend(names.iter().zip(ranges.iter()).filter_map(|(name, range)| {
-                            pycodestyle::rules::ambiguous_variable_name(name, *range)
-                        }));
+                    self.diagnostics.extend(names.iter().filter_map(|name| {
+                        pycodestyle::rules::ambiguous_variable_name(name, name.range())
+                    }));
                 }
             }
             Stmt::Nonlocal(ast::StmtNonlocal { names, range: _ }) => {
-                let ranges: Vec<TextRange> = identifier::names(stmt, self.locator).collect();
                 if !self.semantic.scope_id.is_global() {
-                    for (name, range) in names.iter().zip(ranges.iter()) {
+                    for name in names {
                         if let Some((scope_id, binding_id)) = self.semantic.nonlocal(name) {
                             // Mark the binding as "used".
                             self.semantic.add_local_reference(
                                 binding_id,
-                                *range,
+                                name.range(),
                                 ExecutionContext::Runtime,
                             );
 
```
```diff
@@ -297,7 +294,7 @@ where
 
                             // Add a binding to the current scope.
                             let binding_id = self.semantic.push_binding(
-                                *range,
+                                name.range(),
                                 BindingKind::Nonlocal(scope_id),
                                 BindingFlags::NONLOCAL,
                             );
```
```diff
@@ -309,17 +306,16 @@ where
                                 pylint::rules::NonlocalWithoutBinding {
                                     name: name.to_string(),
                                 },
-                                *range,
+                                name.range(),
                             ));
                         }
                     }
                 }
                 if self.enabled(Rule::AmbiguousVariableName) {
-                    self.diagnostics
-                        .extend(names.iter().zip(ranges.iter()).filter_map(|(name, range)| {
-                            pycodestyle::rules::ambiguous_variable_name(name, *range)
-                        }));
+                    self.diagnostics.extend(names.iter().filter_map(|name| {
+                        pycodestyle::rules::ambiguous_variable_name(name, name.range())
+                    }));
                 }
             }
             Stmt::Break(_) => {
```
````diff
@@ -1,29 +1,20 @@
 //! Extract [`TextRange`] information from AST nodes.
 //!
 //! In the `RustPython` AST, each node has a `range` field that contains the
 //! start and end byte offsets of the node. However, attributes on those
 //! nodes may not have their own ranges. In particular, identifiers are
 //! not given their own ranges, unless they're part of a name expression.
 //!
 //! For example, given:
 //! ```python
 //! def f():
 //!     try:
 //!         ...
 //!     except Exception as e:
 //!         ...
 //! ```
 //!
 //! The statement defining `f` has a range, but the identifier `f` does not.
 //!
-//! This module assists with extracting [`TextRange`] ranges from AST nodes
-//! via manual lexical analysis.
-
-use std::ops::{Add, Sub};
-use std::str::Chars;
+//! This module can be used to identify the [`TextRange`] of the `except` token.
 
 use ruff_text_size::{TextLen, TextRange, TextSize};
 use rustpython_ast::{Alias, Arg, ArgWithDefault, Pattern};
 use rustpython_parser::ast::{self, ExceptHandler, Ranged, Stmt};
 
-use ruff_python_whitespace::is_python_whitespace;
+use ruff_python_whitespace::{is_python_whitespace, Cursor};
 
 use crate::source_code::Locator;
````
````diff
@@ -163,20 +154,6 @@ impl TryIdentifier for ExceptHandler {
     }
 }
 
-/// Return the [`TextRange`] for every name in a [`Stmt`].
-///
-/// Intended to be used for `global` and `nonlocal` statements.
-///
-/// For example, return the ranges of `x` and `y` in:
-/// ```python
-/// global x, y
-/// ```
-pub fn names<'a>(stmt: &Stmt, locator: &'a Locator<'a>) -> impl Iterator<Item = TextRange> + 'a {
-    // Given `global x, y`, the first identifier is `global`, and the remaining identifiers are
-    // the names.
-    IdentifierTokenizer::new(locator.contents(), stmt.range()).skip(1)
-}
-
 /// Return the [`TextRange`] of the `except` token in an [`ExceptHandler`].
 pub fn except(handler: &ExceptHandler, locator: &Locator) -> TextRange {
     IdentifierTokenizer::new(locator.contents(), handler.range())
````
```diff
@@ -248,13 +225,15 @@ impl<'a> IdentifierTokenizer<'a> {
     }
 
     fn next_token(&mut self) -> Option<TextRange> {
-        while let Some(c) = self.cursor.bump() {
+        while let Some(c) = {
+            self.offset += self.cursor.token_len();
+            self.cursor.start_token();
+            self.cursor.bump()
+        } {
             match c {
                 c if is_python_identifier_start(c) => {
-                    let start = self.offset.add(self.cursor.offset()).sub(c.text_len());
                     self.cursor.eat_while(is_python_identifier_continue);
-                    let end = self.offset.add(self.cursor.offset());
-                    return Some(TextRange::new(start, end));
+                    return Some(TextRange::at(self.offset, self.cursor.token_len()));
                 }
 
                 c if is_python_whitespace(c) => {
```
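The offset arithmetic in this rewrite is worth spelling out: `self.offset` now accumulates the width of everything consumed by earlier loop iterations, while `token_len()` measures only the current token, so the new one-liner is equivalent to the old start/end computation. An illustrative restatement, not code from the PR:

```rust
// Old: both endpoints computed by hand from the cursor's running offset.
//   start = self.offset + self.cursor.offset() - c.text_len()
//   end   = self.offset + self.cursor.offset()
//
// New: `start_token()` runs before each `bump()`, so after `eat_while`:
//   TextRange::at(self.offset, self.cursor.token_len())
//       == TextRange::new(self.offset, self.offset + self.cursor.token_len())
```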
```diff
@@ -295,73 +274,15 @@ impl Iterator for IdentifierTokenizer<'_> {
     }
 }
 
-const EOF_CHAR: char = '\0';
-
-#[derive(Debug, Clone)]
-struct Cursor<'a> {
-    chars: Chars<'a>,
-    offset: TextSize,
-}
-
-impl<'a> Cursor<'a> {
-    fn new(source: &'a str) -> Self {
-        Self {
-            chars: source.chars(),
-            offset: TextSize::from(0),
-        }
-    }
-
-    const fn offset(&self) -> TextSize {
-        self.offset
-    }
-
-    /// Peeks the next character from the input stream without consuming it.
-    /// Returns [`EOF_CHAR`] if the file is at the end of the file.
-    fn first(&self) -> char {
-        self.chars.clone().next().unwrap_or(EOF_CHAR)
-    }
-
-    /// Returns `true` if the file is at the end of the file.
-    fn is_eof(&self) -> bool {
-        self.chars.as_str().is_empty()
-    }
-
-    /// Consumes the next character.
-    fn bump(&mut self) -> Option<char> {
-        if let Some(char) = self.chars.next() {
-            self.offset += char.text_len();
-            Some(char)
-        } else {
-            None
-        }
-    }
-
-    /// Eats the next character if it matches the given character.
-    fn eat_char(&mut self, c: char) -> bool {
-        if self.first() == c {
-            self.bump();
-            true
-        } else {
-            false
-        }
-    }
-
-    /// Eats symbols while predicate returns true or until the end of file is reached.
-    fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
-        while predicate(self.first()) && !self.is_eof() {
-            self.bump();
-        }
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use anyhow::Result;
     use ruff_text_size::{TextRange, TextSize};
-    use rustpython_ast::Stmt;
+    use rustpython_ast::{Ranged, Stmt};
     use rustpython_parser::Parse;
 
     use crate::identifier;
+    use crate::identifier::IdentifierTokenizer;
     use crate::source_code::Locator;
 
     #[test]
```
```diff
@@ -383,4 +304,33 @@ else:
     );
     Ok(())
 }
+
+#[test]
+fn extract_global_names() -> Result<()> {
+    let contents = r#"global X,Y, Z"#.trim();
+    let stmt = Stmt::parse(contents, "<filename>")?;
+    let locator = Locator::new(contents);
+
+    let mut names = IdentifierTokenizer::new(locator.contents(), stmt.range());
+
+    let range = names.next_token().unwrap();
+    assert_eq!(&contents[range], "global");
+    assert_eq!(range, TextRange::new(TextSize::from(0), TextSize::from(6)));
+
+    let range = names.next_token().unwrap();
+    assert_eq!(&contents[range], "X");
+    assert_eq!(range, TextRange::new(TextSize::from(7), TextSize::from(8)));
+
+    let range = names.next_token().unwrap();
+    assert_eq!(&contents[range], "Y");
+    assert_eq!(range, TextRange::new(TextSize::from(9), TextSize::from(10)));
+
+    let range = names.next_token().unwrap();
+    assert_eq!(&contents[range], "Z");
+    assert_eq!(
+        range,
+        TextRange::new(TextSize::from(12), TextSize::from(13))
+    );
+    Ok(())
+}
 }
```
```diff
@@ -44,7 +44,6 @@ path = "tests/fixtures.rs"
 test = true
 required-features = [ "serde" ]
 
-
 [features]
 serde = ["dep:serde", "ruff_formatter/serde"]
 default = ["serde"]
```
```diff
@@ -1,9 +1,8 @@
-use std::str::Chars;
-
-use ruff_python_whitespace::is_python_whitespace;
 use ruff_text_size::{TextLen, TextRange, TextSize};
 use unic_ucd_ident::{is_xid_continue, is_xid_start};
 
+use ruff_python_whitespace::{is_python_whitespace, Cursor};
+
 /// Searches for the first non-trivia character in `range`.
 ///
 /// The search skips over any whitespace and comments.
```
```diff
@@ -402,9 +401,7 @@ impl<'a> SimpleTokenizer<'a> {
 
         // Skip the test whether there's a preceding comment if it has been performed before.
         if !self.back_line_has_no_comment {
-            let rest = self.cursor.chars.as_str();
-
-            for (back_index, c) in rest.chars().rev().enumerate() {
+            for (back_index, c) in self.cursor.chars().rev().enumerate() {
                 match c {
                     '#' => {
                         // Potentially a comment
```
```diff
@@ -515,100 +512,6 @@ impl DoubleEndedIterator for SimpleTokenizer<'_> {
     }
 }
 
-const EOF_CHAR: char = '\0';
-
-#[derive(Debug, Clone)]
-struct Cursor<'a> {
-    chars: Chars<'a>,
-    source_length: TextSize,
-}
-
-impl<'a> Cursor<'a> {
-    fn new(source: &'a str) -> Self {
-        Self {
-            source_length: source.text_len(),
-            chars: source.chars(),
-        }
-    }
-
-    /// Peeks the next character from the input stream without consuming it.
-    /// Returns [`EOF_CHAR`] if the file is at the end of the file.
-    fn first(&self) -> char {
-        self.chars.clone().next().unwrap_or(EOF_CHAR)
-    }
-
-    /// Peeks the next character from the input stream without consuming it.
-    /// Returns [`EOF_CHAR`] if the file is at the end of the file.
-    fn last(&self) -> char {
-        self.chars.clone().next_back().unwrap_or(EOF_CHAR)
-    }
-
-    // SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
-    #[allow(clippy::cast_possible_truncation)]
-    fn text_len(&self) -> TextSize {
-        TextSize::new(self.chars.as_str().len() as u32)
-    }
-
-    fn token_len(&self) -> TextSize {
-        self.source_length - self.text_len()
-    }
-
-    fn start_token(&mut self) {
-        self.source_length = self.text_len();
-    }
-
-    /// Returns `true` if the file is at the end of the file.
-    fn is_eof(&self) -> bool {
-        self.chars.as_str().is_empty()
-    }
-
-    /// Consumes the next character
-    fn bump(&mut self) -> Option<char> {
-        self.chars.next()
-    }
-
-    /// Consumes the next character from the back
-    fn bump_back(&mut self) -> Option<char> {
-        self.chars.next_back()
-    }
-
-    fn eat_char(&mut self, c: char) -> bool {
-        if self.first() == c {
-            self.bump();
-            true
-        } else {
-            false
-        }
-    }
-
-    fn eat_char_back(&mut self, c: char) -> bool {
-        if self.last() == c {
-            self.bump_back();
-            true
-        } else {
-            false
-        }
-    }
-
-    /// Eats symbols while predicate returns true or until the end of file is reached.
-    fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
-        // It was tried making optimized version of this for eg. line comments, but
-        // LLVM can inline all of this and compile it down to fast iteration over bytes.
-        while predicate(self.first()) && !self.is_eof() {
-            self.bump();
-        }
-    }
-
-    /// Eats symbols from the back while predicate returns true or until the beginning of file is reached.
-    fn eat_back_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
-        // It was tried making optimized version of this for eg. line comments, but
-        // LLVM can inline all of this and compile it down to fast iteration over bytes.
-        while predicate(self.last()) && !self.is_eof() {
-            self.bump_back();
-        }
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use insta::assert_debug_snapshot;
```
crates/ruff_python_whitespace/src/cursor.rs (new file, +103)
```diff
@@ -0,0 +1,103 @@
+use std::str::Chars;
+
+use ruff_text_size::{TextLen, TextSize};
+
+pub const EOF_CHAR: char = '\0';
+
+/// A [`Cursor`] over a string.
+#[derive(Debug, Clone)]
+pub struct Cursor<'a> {
+    chars: Chars<'a>,
+    source_length: TextSize,
+}
+
+impl<'a> Cursor<'a> {
+    pub fn new(source: &'a str) -> Self {
+        Self {
+            source_length: source.text_len(),
+            chars: source.chars(),
+        }
+    }
+
+    /// Returns a [`Chars`] iterator over the remaining input.
+    pub fn chars(&self) -> Chars<'a> {
+        self.chars.clone()
+    }
+
+    /// Peeks the next character from the input stream without consuming it.
+    /// Returns [`EOF_CHAR`] if the cursor is at the end of the file.
+    pub fn first(&self) -> char {
+        self.chars.clone().next().unwrap_or(EOF_CHAR)
+    }
+
+    /// Peeks the last character from the input stream without consuming it.
+    /// Returns [`EOF_CHAR`] if the cursor is at the end of the file.
+    pub fn last(&self) -> char {
+        self.chars.clone().next_back().unwrap_or(EOF_CHAR)
+    }
+
+    // SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
+    #[allow(clippy::cast_possible_truncation)]
+    pub fn text_len(&self) -> TextSize {
+        TextSize::new(self.chars.as_str().len() as u32)
+    }
+
+    pub fn token_len(&self) -> TextSize {
+        self.source_length - self.text_len()
+    }
+
+    pub fn start_token(&mut self) {
+        self.source_length = self.text_len();
+    }
+
+    /// Returns `true` if the cursor is at the end of the file.
+    pub fn is_eof(&self) -> bool {
+        self.chars.as_str().is_empty()
+    }
+
+    /// Consumes the next character.
+    pub fn bump(&mut self) -> Option<char> {
+        self.chars.next()
+    }
+
+    /// Consumes the next character from the back.
+    pub fn bump_back(&mut self) -> Option<char> {
+        self.chars.next_back()
+    }
+
+    pub fn eat_char(&mut self, c: char) -> bool {
+        if self.first() == c {
+            self.bump();
+            true
+        } else {
+            false
+        }
+    }
+
+    pub fn eat_char_back(&mut self, c: char) -> bool {
+        if self.last() == c {
+            self.bump_back();
+            true
+        } else {
+            false
+        }
+    }
+
+    /// Eats symbols while the predicate returns true or until the end of file is reached.
+    pub fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
+        // It was tried making optimized version of this for eg. line comments, but
+        // LLVM can inline all of this and compile it down to fast iteration over bytes.
+        while predicate(self.first()) && !self.is_eof() {
+            self.bump();
+        }
+    }
+
+    /// Eats symbols from the back while the predicate returns true or until the beginning of file is reached.
+    pub fn eat_back_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
+        // It was tried making optimized version of this for eg. line comments, but
+        // LLVM can inline all of this and compile it down to fast iteration over bytes.
+        while predicate(self.last()) && !self.is_eof() {
+            self.bump_back();
+        }
+    }
+}
```
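The double-ended methods (`last`, `bump_back`, `eat_char_back`, `eat_back_while`) exist because the formatter's `SimpleTokenizer` also iterates from the back. A small usage sketch of the API above; the comment-trimming scenario is invented for illustration:

```rust
use ruff_python_whitespace::{is_python_whitespace, Cursor};

fn main() {
    let mut cursor = Cursor::new("   x = 1  # trailing\n");

    // Consume leading whitespace from the front.
    cursor.eat_while(is_python_whitespace);
    assert_eq!(cursor.first(), 'x');

    // Consume the newline, the comment text, and the `#` from the back...
    cursor.eat_char_back('\n');
    cursor.eat_back_while(|c| c != '#');
    cursor.eat_char_back('#');
    // ...then the whitespace that preceded the comment.
    cursor.eat_back_while(is_python_whitespace);
    assert_eq!(cursor.last(), '1');
}
```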
```diff
@@ -1,5 +1,7 @@
+mod cursor;
 mod newlines;
 mod whitespace;
 
+pub use cursor::*;
 pub use newlines::*;
 pub use whitespace::*;
```