ruff/crates/ruff_python_ast/src/identifier.rs
konsti 14e65afdc6
Update to Rust 1.74 and use new clippy lints table (#8722)
Update to [Rust
1.74](https://blog.rust-lang.org/2023/11/16/Rust-1.74.0.html) and use
the new clippy lints table.

The update itself introduced a new clippy lint about superfluous hashes
in raw strings, which got removed.

I moved our lint config from `rustflags` to the newly stabilized
[workspace.lints](https://doc.rust-lang.org/stable/cargo/reference/workspaces.html#the-lints-table).
One consequence is that we have to `unsafe_code = "warn"` instead of
"forbid" because the latter now actually bans unsafe code:

```
error[E0453]: allow(unsafe_code) incompatible with previous forbid
  --> crates/ruff_source_file/src/newlines.rs:62:17
   |
62 |         #[allow(unsafe_code)]
   |                 ^^^^^^^^^^^ overruled by previous forbid
   |
   = note: `forbid` lint level was set on command line
```

---------

Co-authored-by: Charlie Marsh <charlie.r.marsh@gmail.com>
2023-11-16 18:12:46 -05:00

258 lines
7.4 KiB
Rust

//! Extract [`TextRange`] information from AST nodes.
//!
//! For example, given:
//! ```python
//! try:
//! ...
//! except Exception as e:
//! ...
//! ```
//!
//! This module can be used to identify the [`TextRange`] of the `except` token.
use crate::{self as ast, Alias, ExceptHandler, Parameter, ParameterWithDefault, Stmt};
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
use ruff_python_trivia::{is_python_whitespace, Cursor};
pub trait Identifier {
/// Return the [`TextRange`] of the identifier in the given AST node.
fn identifier(&self) -> TextRange;
}
impl Identifier for ast::StmtFunctionDef {
/// Return the [`TextRange`] of the identifier in the given function definition.
///
/// For example, return the range of `f` in:
/// ```python
/// def f():
/// ...
/// ```
fn identifier(&self) -> TextRange {
self.name.range()
}
}
impl Identifier for ast::StmtClassDef {
/// Return the [`TextRange`] of the identifier in the given class definition.
///
/// For example, return the range of `C` in:
/// ```python
/// class C():
/// ...
/// ```
fn identifier(&self) -> TextRange {
self.name.range()
}
}
impl Identifier for Stmt {
/// Return the [`TextRange`] of the identifier in the given statement.
///
/// For example, return the range of `f` in:
/// ```python
/// def f():
/// ...
/// ```
fn identifier(&self) -> TextRange {
match self {
Stmt::ClassDef(class) => class.identifier(),
Stmt::FunctionDef(function) => function.identifier(),
_ => self.range(),
}
}
}
impl Identifier for Parameter {
/// Return the [`TextRange`] for the identifier defining an [`Parameter`].
///
/// For example, return the range of `x` in:
/// ```python
/// def f(x: int):
/// ...
/// ```
fn identifier(&self) -> TextRange {
self.name.range()
}
}
impl Identifier for ParameterWithDefault {
/// Return the [`TextRange`] for the identifier defining an [`ParameterWithDefault`].
///
/// For example, return the range of `x` in:
/// ```python
/// def f(x: int = 0):
/// ...
/// ```
fn identifier(&self) -> TextRange {
self.parameter.identifier()
}
}
impl Identifier for Alias {
/// Return the [`TextRange`] for the identifier defining an [`Alias`].
///
/// For example, return the range of `x` in:
/// ```python
/// from foo import bar as x
/// ```
fn identifier(&self) -> TextRange {
self.asname
.as_ref()
.map_or_else(|| self.name.range(), Ranged::range)
}
}
/// Return the [`TextRange`] of the `except` token in an [`ExceptHandler`].
pub fn except(handler: &ExceptHandler, source: &str) -> TextRange {
IdentifierTokenizer::new(source, handler.range())
.next()
.expect("Failed to find `except` token in `ExceptHandler`")
}
/// Return the [`TextRange`] of the `else` token in a `For` or `While` statement.
pub fn else_(stmt: &Stmt, source: &str) -> Option<TextRange> {
let (Stmt::For(ast::StmtFor { body, orelse, .. })
| Stmt::While(ast::StmtWhile { body, orelse, .. })) = stmt
else {
return None;
};
if orelse.is_empty() {
return None;
}
IdentifierTokenizer::starts_at(
body.last().expect("Expected body to be non-empty").end(),
source,
)
.next()
}
/// Return `true` if the given character starts a valid Python identifier.
///
/// Python identifiers must start with an alphabetic character or an underscore.
fn is_python_identifier_start(c: char) -> bool {
c.is_alphabetic() || c == '_'
}
/// Return `true` if the given character is a valid Python identifier continuation character.
///
/// Python identifiers can contain alphanumeric characters and underscores, but cannot start with a
/// number.
fn is_python_identifier_continue(c: char) -> bool {
c.is_alphanumeric() || c == '_'
}
/// Simple zero allocation tokenizer for Python identifiers.
///
/// The tokenizer must operate over a range that can only contain identifiers, keywords, and
/// comments (along with whitespace and continuation characters). It does not support other tokens,
/// like operators, literals, or delimiters. It also does not differentiate between keywords and
/// identifiers, treating every valid token as an "identifier".
///
/// This is useful for cases like, e.g., identifying the alias name in an aliased import (`bar` in
/// `import foo as bar`), where we're guaranteed to only have identifiers and keywords in the
/// relevant range.
pub(crate) struct IdentifierTokenizer<'a> {
cursor: Cursor<'a>,
offset: TextSize,
}
impl<'a> IdentifierTokenizer<'a> {
pub(crate) fn new(source: &'a str, range: TextRange) -> Self {
Self {
cursor: Cursor::new(&source[range]),
offset: range.start(),
}
}
pub(crate) fn starts_at(offset: TextSize, source: &'a str) -> Self {
let range = TextRange::new(offset, source.text_len());
Self::new(source, range)
}
fn next_token(&mut self) -> Option<TextRange> {
while let Some(c) = {
self.offset += self.cursor.token_len();
self.cursor.start_token();
self.cursor.bump()
} {
match c {
c if is_python_identifier_start(c) => {
self.cursor.eat_while(is_python_identifier_continue);
return Some(TextRange::at(self.offset, self.cursor.token_len()));
}
c if is_python_whitespace(c) => {
self.cursor.eat_while(is_python_whitespace);
}
'#' => {
self.cursor.eat_while(|c| !matches!(c, '\n' | '\r'));
}
'\r' => {
self.cursor.eat_char('\n');
}
'\n' => {
// Nothing to do.
}
'\\' => {
// Nothing to do.
}
_ => {
// Nothing to do.
}
};
}
None
}
}
impl Iterator for IdentifierTokenizer<'_> {
type Item = TextRange;
fn next(&mut self) -> Option<Self::Item> {
self.next_token()
}
}
#[cfg(test)]
mod tests {
use super::IdentifierTokenizer;
use ruff_text_size::{TextLen, TextRange, TextSize};
#[test]
fn extract_global_names() {
let contents = r"global X,Y, Z".trim();
let mut names = IdentifierTokenizer::new(
contents,
TextRange::new(TextSize::new(0), contents.text_len()),
);
let range = names.next_token().unwrap();
assert_eq!(&contents[range], "global");
assert_eq!(range, TextRange::new(TextSize::from(0), TextSize::from(6)));
let range = names.next_token().unwrap();
assert_eq!(&contents[range], "X");
assert_eq!(range, TextRange::new(TextSize::from(7), TextSize::from(8)));
let range = names.next_token().unwrap();
assert_eq!(&contents[range], "Y");
assert_eq!(range, TextRange::new(TextSize::from(9), TextSize::from(10)));
let range = names.next_token().unwrap();
assert_eq!(&contents[range], "Z");
assert_eq!(
range,
TextRange::new(TextSize::from(12), TextSize::from(13))
);
}
}