//! Extract [`TextRange`] information from AST nodes. //! //! In the `RustPython` AST, each node has a `range` field that contains the //! start and end byte offsets of the node. However, attributes on those //! nodes may not have their own ranges. In particular, identifiers are //! not given their own ranges, unless they're part of a name expression. //! //! For example, given: //! ```python //! def f(): //! ... //! ``` //! //! The statement defining `f` has a range, but the identifier `f` does not. //! //! This module assists with extracting [`TextRange`] ranges from AST nodes //! via manual lexical analysis. use std::ops::{Add, Sub}; use std::str::Chars; use ruff_text_size::{TextLen, TextRange, TextSize}; use rustpython_ast::{Alias, Arg, ArgWithDefault, Pattern}; use rustpython_parser::ast::{self, ExceptHandler, Ranged, Stmt}; use ruff_python_whitespace::is_python_whitespace; use crate::source_code::Locator; pub trait Identifier { /// Return the [`TextRange`] of the identifier in the given AST node. fn identifier(&self) -> TextRange; } pub trait TryIdentifier { /// Return the [`TextRange`] of the identifier in the given AST node, or `None` if /// the node does not have an identifier. fn try_identifier(&self) -> Option; } impl Identifier for Stmt { /// Return the [`TextRange`] of the identifier in the given statement. /// /// For example, return the range of `f` in: /// ```python /// def f(): /// ... /// ``` fn identifier(&self) -> TextRange { match self { Stmt::ClassDef(ast::StmtClassDef { name, .. }) | Stmt::FunctionDef(ast::StmtFunctionDef { name, .. }) | Stmt::AsyncFunctionDef(ast::StmtAsyncFunctionDef { name, .. }) => name.range(), _ => self.range(), } } } impl Identifier for Arg { /// Return the [`TextRange`] for the identifier defining an [`Arg`]. /// /// For example, return the range of `x` in: /// ```python /// def f(x: int): /// ... /// ``` fn identifier(&self) -> TextRange { self.arg.range() } } impl Identifier for ArgWithDefault { /// Return the [`TextRange`] for the identifier defining an [`ArgWithDefault`]. /// /// For example, return the range of `x` in: /// ```python /// def f(x: int = 0): /// ... /// ``` fn identifier(&self) -> TextRange { self.def.identifier() } } impl Identifier for Alias { /// Return the [`TextRange`] for the identifier defining an [`Alias`]. /// /// For example, return the range of `x` in: /// ```python /// from foo import bar as x /// ``` fn identifier(&self) -> TextRange { self.asname .as_ref() .map_or_else(|| self.name.range(), Ranged::range) } } impl TryIdentifier for Pattern { /// Return the [`TextRange`] of the identifier in the given pattern. /// /// For example, return the range of `z` in: /// ```python /// match x: /// # Pattern::MatchAs /// case z: /// ... /// ``` /// /// Or: /// ```python /// match x: /// # Pattern::MatchAs /// case y as z: /// ... /// ``` /// /// Or : /// ```python /// match x: /// # Pattern::MatchMapping /// case {"a": 1, **z} /// ... /// ``` /// /// Or : /// ```python /// match x: /// # Pattern::MatchStar /// case *z: /// ... /// ``` fn try_identifier(&self) -> Option { let name = match self { Pattern::MatchAs(ast::PatternMatchAs { name: Some(name), .. }) => Some(name), Pattern::MatchMapping(ast::PatternMatchMapping { rest: Some(rest), .. }) => Some(rest), Pattern::MatchStar(ast::PatternMatchStar { name: Some(name), .. }) => Some(name), _ => None, }; name.map(Ranged::range) } } impl TryIdentifier for ExceptHandler { /// Return the [`TextRange`] of a named exception in an [`ExceptHandler`]. /// /// For example, return the range of `e` in: /// ```python /// try: /// ... /// except ValueError as e: /// ... /// ``` fn try_identifier(&self) -> Option { let ExceptHandler::ExceptHandler(ast::ExceptHandlerExceptHandler { name, .. }) = self; name.as_ref().map(Ranged::range) } } /// Return the [`TextRange`] for every name in a [`Stmt`]. /// /// Intended to be used for `global` and `nonlocal` statements. /// /// For example, return the ranges of `x` and `y` in: /// ```python /// global x, y /// ``` pub fn names<'a>(stmt: &Stmt, locator: &'a Locator<'a>) -> impl Iterator + 'a { // Given `global x, y`, the first identifier is `global`, and the remaining identifiers are // the names. IdentifierTokenizer::new(locator.contents(), stmt.range()).skip(1) } /// Return the [`TextRange`] of the `except` token in an [`ExceptHandler`]. pub fn except(handler: &ExceptHandler, locator: &Locator) -> TextRange { IdentifierTokenizer::new(locator.contents(), handler.range()) .next() .expect("Failed to find `except` token in `ExceptHandler`") } /// Return the [`TextRange`] of the `else` token in a `For`, `AsyncFor`, or `While` statement. pub fn else_(stmt: &Stmt, locator: &Locator) -> Option { let (Stmt::For(ast::StmtFor { body, orelse, .. }) | Stmt::AsyncFor(ast::StmtAsyncFor { body, orelse, .. }) | Stmt::While(ast::StmtWhile { body, orelse, .. })) = stmt else { return None; }; if orelse.is_empty() { return None; } IdentifierTokenizer::starts_at( body.last().expect("Expected body to be non-empty").end(), locator.contents(), ) .next() } /// Return `true` if the given character starts a valid Python identifier. /// /// Python identifiers must start with an alphabetic character or an underscore. fn is_python_identifier_start(c: char) -> bool { c.is_alphabetic() || c == '_' } /// Return `true` if the given character is a valid Python identifier continuation character. /// /// Python identifiers can contain alphanumeric characters and underscores, but cannot start with a /// number. fn is_python_identifier_continue(c: char) -> bool { c.is_alphanumeric() || c == '_' } /// Simple zero allocation tokenizer for Python identifiers. /// /// The tokenizer must operate over a range that can only contain identifiers, keywords, and /// comments (along with whitespace and continuation characters). It does not support other tokens, /// like operators, literals, or delimiters. It also does not differentiate between keywords and /// identifiers, treating every valid token as an "identifier". /// /// This is useful for cases like, e.g., identifying the alias name in an aliased import (`bar` in /// `import foo as bar`), where we're guaranteed to only have identifiers and keywords in the /// relevant range. pub(crate) struct IdentifierTokenizer<'a> { cursor: Cursor<'a>, offset: TextSize, } impl<'a> IdentifierTokenizer<'a> { pub(crate) fn new(source: &'a str, range: TextRange) -> Self { Self { cursor: Cursor::new(&source[range]), offset: range.start(), } } pub(crate) fn starts_at(offset: TextSize, source: &'a str) -> Self { let range = TextRange::new(offset, source.text_len()); Self::new(source, range) } fn next_token(&mut self) -> Option { while let Some(c) = self.cursor.bump() { match c { c if is_python_identifier_start(c) => { let start = self.offset.add(self.cursor.offset()).sub(c.text_len()); self.cursor.eat_while(is_python_identifier_continue); let end = self.offset.add(self.cursor.offset()); return Some(TextRange::new(start, end)); } c if is_python_whitespace(c) => { self.cursor.eat_while(is_python_whitespace); } '#' => { self.cursor.eat_while(|c| !matches!(c, '\n' | '\r')); } '\r' => { self.cursor.eat_char('\n'); } '\n' => { // Nothing to do. } '\\' => { // Nothing to do. } _ => { // Nothing to do. } }; } None } } impl Iterator for IdentifierTokenizer<'_> { type Item = TextRange; fn next(&mut self) -> Option { self.next_token() } } const EOF_CHAR: char = '\0'; #[derive(Debug, Clone)] struct Cursor<'a> { chars: Chars<'a>, offset: TextSize, } impl<'a> Cursor<'a> { fn new(source: &'a str) -> Self { Self { chars: source.chars(), offset: TextSize::from(0), } } const fn offset(&self) -> TextSize { self.offset } /// Peeks the next character from the input stream without consuming it. /// Returns [`EOF_CHAR`] if the file is at the end of the file. fn first(&self) -> char { self.chars.clone().next().unwrap_or(EOF_CHAR) } /// Returns `true` if the file is at the end of the file. fn is_eof(&self) -> bool { self.chars.as_str().is_empty() } /// Consumes the next character. fn bump(&mut self) -> Option { if let Some(char) = self.chars.next() { self.offset += char.text_len(); Some(char) } else { None } } /// Eats the next character if it matches the given character. fn eat_char(&mut self, c: char) -> bool { if self.first() == c { self.bump(); true } else { false } } /// Eats symbols while predicate returns true or until the end of file is reached. fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { while predicate(self.first()) && !self.is_eof() { self.bump(); } } } #[cfg(test)] mod tests { use anyhow::Result; use ruff_text_size::{TextRange, TextSize}; use rustpython_ast::Stmt; use rustpython_parser::Parse; use crate::identifier; use crate::source_code::Locator; #[test] fn extract_else_range() -> Result<()> { let contents = r#" for x in y: pass else: pass "# .trim(); let stmt = Stmt::parse(contents, "")?; let locator = Locator::new(contents); let range = identifier::else_(&stmt, &locator).unwrap(); assert_eq!(&contents[range], "else"); assert_eq!( range, TextRange::new(TextSize::from(21), TextSize::from(25)) ); Ok(()) } }