
## Summary

Garbage collect ASTs once we are done checking a given file. Queries with a cross-file dependency on the AST will reparse the file on demand. This reduces ty's peak memory usage by ~20-30%.

The primary change of this PR is adding a `node_index` field to every AST node that is assigned by the parser. `ParsedModule` can use this to create a flat index of AST nodes any time the file is parsed (or reparsed). This allows `AstNodeRef` to simply index into the current instance of the `ParsedModule` instead of storing a pointer directly.

The indices are somewhat hackily (using an atomic integer) assigned by the `parsed_module` query instead of by the parser directly. Assigning the indices in source order in the (recursive) parser turns out to be difficult, and collecting the nodes during semantic indexing is impossible as `SemanticIndex` does not hold onto a specific `ParsedModuleRef`, which the pointers in the flat AST are tied to. This means that we have to do an extra AST traversal to assign and collect the nodes into a flat index, but the small performance impact (~3% on cold runs) seems worth it for the memory savings.

Part of https://github.com/astral-sh/ty/issues/214.
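The sketch below illustrates the indexing scheme described above under simplifying assumptions: `NodeIndex`, `FlatAst`, and the string-based node list are hypothetical stand-ins rather than the actual ty/ruff types, which store typed references into the parsed AST.

```rust
/// Hypothetical stand-in for the handle stored by cross-file queries: a plain
/// index instead of a pointer into one particular allocation of the AST.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct NodeIndex(u32);

/// Hypothetical stand-in for a parsed module: a flat, source-ordered node index
/// that is rebuilt by one extra traversal every time the file is (re)parsed.
#[derive(Debug, Clone)]
struct FlatAst {
    nodes: Vec<String>,
}

impl FlatAst {
    /// Resolves a handle against *this* parse of the file.
    fn resolve(&self, handle: NodeIndex) -> &str {
        &self.nodes[handle.0 as usize]
    }
}

fn main() {
    // First parse: indices are assigned in source order.
    let first_parse = FlatAst {
        nodes: vec![
            "Name(print)".to_string(),
            "StringLiteral(\"Hello world\")".to_string(),
        ],
    };
    let handle = NodeIndex(1);

    // The AST is garbage collected and later reparsed on demand. Because the
    // indices are assigned deterministically, a handle created against the old
    // parse still resolves to the same node in the new one.
    let reparse = first_parse.clone();
    assert_eq!(first_parse.resolve(handle), reparse.resolve(handle));
}
```

The trade-off noted above (one extra traversal per parse, roughly 3% on cold runs) is what buys the ability to drop the tree entirely once a file has been checked.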
//! This crate can be used to parse Python source code into an Abstract
//! Syntax Tree.
//!
//! ## Overview
//!
//! The process by which source code is parsed into an AST can be broken down
//! into two general stages: [lexical analysis] and [parsing].
//!
//! During lexical analysis, the source code is converted into a stream of lexical
//! tokens that represent the smallest meaningful units of the language. For example,
//! the source code `print("Hello world")` would _roughly_ be converted into the following
//! stream of tokens:
//!
//! ```text
//! Name("print"), LeftParen, String("Hello world"), RightParen
//! ```
//!
//! These tokens are then consumed by the `ruff_python_parser`, which matches them against a set of
//! grammar rules to verify that the source code is syntactically valid and to construct
//! an AST that represents the source code.
//!
//! During parsing, the `ruff_python_parser` consumes the tokens generated by the lexer and constructs
//! a tree representation of the source code. The tree is made up of nodes that represent
//! the different syntactic constructs of the language. If the source code is syntactically
//! invalid, parsing fails and an error is returned. After a successful parse, the AST can
//! be used to perform further analysis on the source code. Continuing with the example
//! above, the AST generated by the `ruff_python_parser` would _roughly_ look something like this:
//!
//! ```text
//! node: Expr {
//!     value: {
//!         node: Call {
//!             func: {
//!                 node: Name {
//!                     id: "print",
//!                     ctx: Load,
//!                 },
//!             },
//!             args: [
//!                 node: Constant {
//!                     value: Str("Hello World"),
//!                     kind: None,
//!                 },
//!             ],
//!             keywords: [],
//!         },
//!     },
//! },
//! ```
//!
//! **Note:** The Tokens/ASTs shown above are not the exact tokens/ASTs generated by the `ruff_python_parser`.
//! Refer to the [playground](https://play.ruff.rs) for the correct representation.
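//!
//! Both stages are run by the convenience functions in this crate. A short example (using only
//! items defined in this crate):
//!
//! ```
//! use ruff_python_parser::parse_module;
//!
//! let parsed = parse_module(r#"print("Hello world")"#).unwrap();
//! // Both the lexed tokens and the AST are available on the result.
//! assert!(!parsed.tokens().is_empty());
//! assert_eq!(parsed.suite().len(), 1);
//! ```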
//!
//! ## Source code layout
//!
//! The functionality of this crate is split into several modules:
//!
//! - token: This module contains the definition of the tokens that are generated by the lexer.
//! - [lexer]: This module contains the lexer and is responsible for generating the tokens.
//! - parser: This module contains an interface to the [Parsed] and is responsible for generating the AST.
//! - mode: This module contains the definition of the different modes that the `ruff_python_parser` can be in.
//!
//! [lexical analysis]: https://en.wikipedia.org/wiki/Lexical_analysis
//! [parsing]: https://en.wikipedia.org/wiki/Parsing
//! [lexer]: crate::lexer

use std::iter::FusedIterator;
use std::ops::Deref;

pub use crate::error::{
    InterpolatedStringErrorType, LexicalErrorType, ParseError, ParseErrorType,
    UnsupportedSyntaxError, UnsupportedSyntaxErrorKind,
};
pub use crate::parser::ParseOptions;
pub use crate::token::{Token, TokenKind};

use crate::parser::Parser;

use ruff_python_ast::{
    Expr, Mod, ModExpression, ModModule, PySourceType, StringFlags, StringLiteral, Suite,
};
use ruff_python_trivia::CommentRanges;
use ruff_text_size::{Ranged, TextRange, TextSize};

mod error;
pub mod lexer;
mod parser;
pub mod semantic_errors;
mod string;
mod token;
mod token_set;
mod token_source;
pub mod typing;

/// Parse a full Python module usually consisting of multiple lines.
///
/// This is a convenience function that can be used to parse a full Python program without having to
/// specify the [`Mode`] or the location. It is probably what you want to use most of the time.
///
/// # Example
///
/// For example, parsing a simple function definition and a call to that function:
///
/// ```
/// use ruff_python_parser::parse_module;
///
/// let source = r#"
/// def foo():
///     return 42
///
/// print(foo())
/// "#;
///
/// let module = parse_module(source);
/// assert!(module.is_ok());
/// ```
pub fn parse_module(source: &str) -> Result<Parsed<ModModule>, ParseError> {
    Parser::new(source, ParseOptions::from(Mode::Module))
        .parse()
        .try_into_module()
        .unwrap()
        .into_result()
}

/// Parses a single Python expression.
///
/// This convenience function can be used to parse a single expression without having to
/// specify the Mode or the location.
///
/// # Example
///
/// For example, parsing a single expression denoting the addition of two numbers:
///
/// ```
/// use ruff_python_parser::parse_expression;
///
/// let expr = parse_expression("1 + 2");
/// assert!(expr.is_ok());
/// ```
pub fn parse_expression(source: &str) -> Result<Parsed<ModExpression>, ParseError> {
    Parser::new(source, ParseOptions::from(Mode::Expression))
        .parse()
        .try_into_expression()
        .unwrap()
        .into_result()
}

/// Parses a Python expression for the given range in the source.
///
/// This function allows specifying the range of the expression in the source code; other than
/// that, it behaves exactly like [`parse_expression`].
///
/// # Example
///
/// Parsing one of the numeric literals that is part of an addition expression:
///
/// ```
/// use ruff_python_parser::parse_expression_range;
/// # use ruff_text_size::{TextRange, TextSize};
///
/// let parsed = parse_expression_range("11 + 22 + 33", TextRange::new(TextSize::new(5), TextSize::new(7)));
/// assert!(parsed.is_ok());
/// ```
pub fn parse_expression_range(
    source: &str,
    range: TextRange,
) -> Result<Parsed<ModExpression>, ParseError> {
    let source = &source[..range.end().to_usize()];
    Parser::new_starts_at(source, range.start(), ParseOptions::from(Mode::Expression))
        .parse()
        .try_into_expression()
        .unwrap()
        .into_result()
}

/// Parses a Python expression as if it is parenthesized.
///
/// It behaves similarly to [`parse_expression_range`] but allows what would only be valid within
/// parentheses, such as a multiline expression.
///
/// # Example
///
/// Parsing an expression that would be valid within parentheses:
///
/// ```
/// use ruff_python_parser::parse_parenthesized_expression_range;
/// # use ruff_text_size::{TextRange, TextSize};
///
/// let parsed = parse_parenthesized_expression_range("'''\n int | str'''", TextRange::new(TextSize::new(3), TextSize::new(14)));
/// assert!(parsed.is_ok());
/// ```
pub fn parse_parenthesized_expression_range(
    source: &str,
    range: TextRange,
) -> Result<Parsed<ModExpression>, ParseError> {
    let source = &source[..range.end().to_usize()];
    let parsed = Parser::new_starts_at(
        source,
        range.start(),
        ParseOptions::from(Mode::ParenthesizedExpression),
    )
    .parse();
    parsed.try_into_expression().unwrap().into_result()
}

/// Parses a Python expression from a string annotation.
///
/// # Example
///
/// Parsing a string annotation:
///
/// ```
/// use ruff_python_parser::parse_string_annotation;
/// use ruff_python_ast::{StringLiteral, StringLiteralFlags, AtomicNodeIndex};
/// use ruff_text_size::{TextRange, TextSize};
///
/// let string = StringLiteral {
///     value: "'''\n int | str'''".to_string().into_boxed_str(),
///     flags: StringLiteralFlags::empty(),
///     range: TextRange::new(TextSize::new(0), TextSize::new(16)),
///     node_index: AtomicNodeIndex::dummy()
/// };
/// let parsed = parse_string_annotation("'''\n int | str'''", &string);
/// assert!(!parsed.is_ok());
/// ```
pub fn parse_string_annotation(
    source: &str,
    string: &StringLiteral,
) -> Result<Parsed<ModExpression>, ParseError> {
    let range = string
        .range()
        .add_start(string.flags.opener_len())
        .sub_end(string.flags.closer_len());
    let source = &source[..range.end().to_usize()];
    if string.flags.is_triple_quoted() {
        parse_parenthesized_expression_range(source, range)
    } else {
        parse_expression_range(source, range)
    }
}

/// Parse the given Python source code using the specified [`ParseOptions`].
///
/// This function is the most general function to parse Python code. Based on the [`Mode`] supplied
/// via the [`ParseOptions`], it can be used to parse a single expression, a full Python program,
/// an interactive expression or a Python program containing IPython escape commands.
///
/// # Example
///
/// If we want to parse a simple expression, we can use the [`Mode::Expression`] mode during
/// parsing:
///
/// ```
/// use ruff_python_parser::{parse, Mode, ParseOptions};
///
/// let parsed = parse("1 + 2", ParseOptions::from(Mode::Expression));
/// assert!(parsed.is_ok());
/// ```
///
/// Alternatively, we can parse a full Python program consisting of multiple lines:
///
/// ```
/// use ruff_python_parser::{parse, Mode, ParseOptions};
///
/// let source = r#"
/// class Greeter:
///
///     def greet(self):
///         print("Hello, world!")
/// "#;
/// let parsed = parse(source, ParseOptions::from(Mode::Module));
/// assert!(parsed.is_ok());
/// ```
///
/// Additionally, we can parse a Python program containing IPython escapes:
///
/// ```
/// use ruff_python_parser::{parse, Mode, ParseOptions};
///
/// let source = r#"
/// %timeit 1 + 2
/// ?str.replace
/// !ls
/// "#;
/// let parsed = parse(source, ParseOptions::from(Mode::Ipython));
/// assert!(parsed.is_ok());
/// ```
pub fn parse(source: &str, options: ParseOptions) -> Result<Parsed<Mod>, ParseError> {
    parse_unchecked(source, options).into_result()
}

/// Parse the given Python source code using the specified [`ParseOptions`].
///
/// This is the same as the [`parse`] function except that it doesn't check for any [`ParseError`]
/// and returns the [`Parsed`] as is.
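///
/// # Example
///
/// A short example (all items used below are defined in this crate); the errors can be
/// inspected on the returned output:
///
/// ```
/// use ruff_python_parser::{parse_unchecked, Mode, ParseOptions};
///
/// // `x =` is invalid, but the output is still returned along with its errors.
/// let parsed = parse_unchecked("x =", ParseOptions::from(Mode::Module));
/// assert!(parsed.has_invalid_syntax());
/// ```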
pub fn parse_unchecked(source: &str, options: ParseOptions) -> Parsed<Mod> {
    Parser::new(source, options).parse()
}

/// Parse the given Python source code using the specified [`PySourceType`].
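///
/// # Example
///
/// A short example (all items used below are public in this crate or `ruff_python_ast`):
///
/// ```
/// use ruff_python_ast::PySourceType;
/// use ruff_python_parser::parse_unchecked_source;
///
/// let parsed = parse_unchecked_source("print('hello')", PySourceType::Python);
/// assert!(parsed.has_valid_syntax());
/// ```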
pub fn parse_unchecked_source(source: &str, source_type: PySourceType) -> Parsed<ModModule> {
    // SAFETY: Safe because `PySourceType` always parses to a `ModModule`
    Parser::new(source, ParseOptions::from(source_type))
        .parse()
        .try_into_module()
        .unwrap()
}

/// Represents the parsed source code.
#[derive(Debug, PartialEq, Clone)]
pub struct Parsed<T> {
    syntax: T,
    tokens: Tokens,
    errors: Vec<ParseError>,
    unsupported_syntax_errors: Vec<UnsupportedSyntaxError>,
}

impl<T> Parsed<T> {
    /// Returns the syntax node represented by this parsed output.
    pub fn syntax(&self) -> &T {
        &self.syntax
    }

    /// Returns all the tokens for the parsed output.
    pub fn tokens(&self) -> &Tokens {
        &self.tokens
    }

    /// Returns a list of syntax errors found during parsing.
    pub fn errors(&self) -> &[ParseError] {
        &self.errors
    }

    /// Returns a list of version-related syntax errors found during parsing.
    pub fn unsupported_syntax_errors(&self) -> &[UnsupportedSyntaxError] {
        &self.unsupported_syntax_errors
    }

    /// Consumes the [`Parsed`] output and returns the contained syntax node.
    pub fn into_syntax(self) -> T {
        self.syntax
    }

    /// Consumes the [`Parsed`] output and returns a list of syntax errors found during parsing.
    pub fn into_errors(self) -> Vec<ParseError> {
        self.errors
    }

    /// Returns `true` if the parsed source code is valid i.e., it has no [`ParseError`]s.
    ///
    /// Note that this does not include version-related [`UnsupportedSyntaxError`]s.
    ///
    /// See [`Parsed::has_no_syntax_errors`] for a version that takes these into account.
    pub fn has_valid_syntax(&self) -> bool {
        self.errors.is_empty()
    }

    /// Returns `true` if the parsed source code is invalid i.e., it has [`ParseError`]s.
    ///
    /// Note that this does not include version-related [`UnsupportedSyntaxError`]s.
    ///
    /// See [`Parsed::has_no_syntax_errors`] for a version that takes these into account.
    pub fn has_invalid_syntax(&self) -> bool {
        !self.has_valid_syntax()
    }

    /// Returns `true` if the parsed source code does not contain any [`ParseError`]s *or*
    /// [`UnsupportedSyntaxError`]s.
    ///
    /// See [`Parsed::has_valid_syntax`] for a version specific to [`ParseError`]s.
    pub fn has_no_syntax_errors(&self) -> bool {
        self.has_valid_syntax() && self.unsupported_syntax_errors.is_empty()
    }

    /// Returns `true` if the parsed source code contains any [`ParseError`]s *or*
    /// [`UnsupportedSyntaxError`]s.
    ///
    /// See [`Parsed::has_invalid_syntax`] for a version specific to [`ParseError`]s.
    pub fn has_syntax_errors(&self) -> bool {
        !self.has_no_syntax_errors()
    }

    /// Returns the [`Parsed`] output as a [`Result`], returning [`Ok`] if it has no syntax errors,
    /// or [`Err`] containing the first [`ParseError`] encountered.
    ///
    /// Note that any [`unsupported_syntax_errors`](Parsed::unsupported_syntax_errors) will not
    /// cause [`Err`] to be returned.
    pub fn as_result(&self) -> Result<&Parsed<T>, &[ParseError]> {
        if self.has_valid_syntax() {
            Ok(self)
        } else {
            Err(&self.errors)
        }
    }

    /// Consumes the [`Parsed`] output and returns a [`Result`] which is [`Ok`] if it has no syntax
    /// errors, or [`Err`] containing the first [`ParseError`] encountered.
    ///
    /// Note that any [`unsupported_syntax_errors`](Parsed::unsupported_syntax_errors) will not
    /// cause [`Err`] to be returned.
    pub(crate) fn into_result(self) -> Result<Parsed<T>, ParseError> {
        if self.has_valid_syntax() {
            Ok(self)
        } else {
            Err(self.into_errors().into_iter().next().unwrap())
        }
    }
}

impl Parsed<Mod> {
    /// Attempts to convert the [`Parsed<Mod>`] into a [`Parsed<ModModule>`].
    ///
    /// This method checks if the `syntax` field of the output is a [`Mod::Module`]. If it is, the
    /// method returns [`Some(Parsed<ModModule>)`] with the contained module. Otherwise, it
    /// returns [`None`].
    ///
    /// [`Some(Parsed<ModModule>)`]: Some
    pub fn try_into_module(self) -> Option<Parsed<ModModule>> {
        match self.syntax {
            Mod::Module(module) => Some(Parsed {
                syntax: module,
                tokens: self.tokens,
                errors: self.errors,
                unsupported_syntax_errors: self.unsupported_syntax_errors,
            }),
            Mod::Expression(_) => None,
        }
    }

    /// Attempts to convert the [`Parsed<Mod>`] into a [`Parsed<ModExpression>`].
    ///
    /// This method checks if the `syntax` field of the output is a [`Mod::Expression`]. If it is,
    /// the method returns [`Some(Parsed<ModExpression>)`] with the contained expression.
    /// Otherwise, it returns [`None`].
    ///
    /// [`Some(Parsed<ModExpression>)`]: Some
    pub fn try_into_expression(self) -> Option<Parsed<ModExpression>> {
        match self.syntax {
            Mod::Module(_) => None,
            Mod::Expression(expression) => Some(Parsed {
                syntax: expression,
                tokens: self.tokens,
                errors: self.errors,
                unsupported_syntax_errors: self.unsupported_syntax_errors,
            }),
        }
    }
}

impl Parsed<ModModule> {
    /// Returns the module body contained in this parsed output as a [`Suite`].
    pub fn suite(&self) -> &Suite {
        &self.syntax.body
    }

    /// Consumes the [`Parsed`] output and returns the module body as a [`Suite`].
    pub fn into_suite(self) -> Suite {
        self.syntax.body
    }
}

impl Parsed<ModExpression> {
    /// Returns the expression contained in this parsed output.
    pub fn expr(&self) -> &Expr {
        &self.syntax.body
    }

    /// Returns a mutable reference to the expression contained in this parsed output.
    pub fn expr_mut(&mut self) -> &mut Expr {
        &mut self.syntax.body
    }

    /// Consumes the [`Parsed`] output and returns the contained [`Expr`].
    pub fn into_expr(self) -> Expr {
        *self.syntax.body
    }
}

/// Tokens represents a vector of lexed [`Token`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Tokens {
    raw: Vec<Token>,
}

impl Tokens {
    pub(crate) fn new(tokens: Vec<Token>) -> Tokens {
        Tokens { raw: tokens }
    }

    /// Returns an iterator over all the tokens that provides context.
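    ///
    /// # Example
    ///
    /// A short example of tracking the parenthesization level while iterating (all items used
    /// below are public in this crate or `ruff_python_ast`):
    ///
    /// ```
    /// use ruff_python_ast::PySourceType;
    /// use ruff_python_parser::parse_unchecked_source;
    ///
    /// let parsed = parse_unchecked_source("f(x)", PySourceType::Python);
    /// let mut iter = parsed.tokens().iter_with_context();
    /// iter.next(); // `f`
    /// iter.next(); // `(` opens a parenthesized context
    /// assert!(iter.in_parenthesized_context());
    /// ```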
    pub fn iter_with_context(&self) -> TokenIterWithContext {
        TokenIterWithContext::new(&self.raw)
    }

    /// Returns a slice of [`Token`] that are within the given `range`.
    ///
    /// The start and end offset of the given range should be either:
    /// 1. Token boundary
    /// 2. Gap between the tokens
    ///
    /// For example, considering the following tokens and their corresponding range:
    ///
    /// | Token               | Range    |
    /// |---------------------|----------|
    /// | `Def`               | `0..3`   |
    /// | `Name`              | `4..7`   |
    /// | `Lpar`              | `7..8`   |
    /// | `Rpar`              | `8..9`   |
    /// | `Colon`             | `9..10`  |
    /// | `Newline`           | `10..11` |
    /// | `Comment`           | `15..24` |
    /// | `NonLogicalNewline` | `24..25` |
    /// | `Indent`            | `25..29` |
    /// | `Pass`              | `29..33` |
    ///
    /// Here, for (1) a token boundary is considered either the start or end offset of any of the
    /// above tokens. For (2), the gap would be any offset between the `Newline` and `Comment`
    /// token which are 12, 13, and 14.
    ///
    /// Examples:
    /// 1) `4..10` would give `Name`, `Lpar`, `Rpar`, `Colon`
    /// 2) `11..25` would give `Comment`, `NonLogicalNewline`
    /// 3) `12..25` would give the same as (2) and offset 12 is in the "gap"
    /// 4) `9..12` would give `Colon`, `Newline` and offset 12 is in the "gap"
    /// 5) `18..27` would panic because both the start and end offsets are within a token
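    ///
    /// A minimal doctest along the same lines, using token boundaries only (all items used
    /// below are public in this crate or its dependencies):
    ///
    /// ```
    /// use ruff_python_ast::PySourceType;
    /// use ruff_python_parser::parse_unchecked_source;
    /// use ruff_text_size::{TextRange, TextSize};
    ///
    /// let parsed = parse_unchecked_source("x = 1", PySourceType::Python);
    /// // The range 0..3 ends exactly at the end of the `=` token, so it yields `x` and `=`.
    /// let tokens = parsed.tokens().in_range(TextRange::new(TextSize::new(0), TextSize::new(3)));
    /// assert_eq!(tokens.len(), 2);
    /// ```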
    ///
    /// ## Note
    ///
    /// The returned slice can contain the [`TokenKind::Unknown`] token if there was a lexical
    /// error encountered within the given range.
    ///
    /// # Panics
    ///
    /// If either the start or end offset of the given range is within a token range.
    pub fn in_range(&self, range: TextRange) -> &[Token] {
        let tokens_after_start = self.after(range.start());

        match tokens_after_start.binary_search_by_key(&range.end(), Ranged::end) {
            Ok(idx) => {
                // If we found the token with the end offset, that token should be included in the
                // return slice.
                &tokens_after_start[..=idx]
            }
            Err(idx) => {
                if let Some(token) = tokens_after_start.get(idx) {
                    // If it's equal to the start offset, then it's at a token boundary which is
                    // valid. If it's less than the start offset, then it's in the gap between the
                    // tokens which is valid as well.
                    assert!(
                        range.end() <= token.start(),
                        "End offset {:?} is inside a token range {:?}",
                        range.end(),
                        token.range()
                    );
                }

                // This index is where the token with the offset _could_ be, so that token should
                // be excluded from the return slice.
                &tokens_after_start[..idx]
            }
        }
    }

    /// Searches the token(s) at `offset`.
    ///
    /// Returns [`TokenAt::Between`] if `offset` points directly in between two tokens
    /// (the left token ends at `offset` and the right token starts at `offset`).
    ///
    /// ## Examples
    ///
    /// [Playground](https://play.ruff.rs/f3ad0a55-5931-4a13-96c7-b2b8bfdc9a2e?secondary=Tokens)
    ///
    /// ```
    /// # use ruff_python_ast::PySourceType;
    /// # use ruff_python_parser::{Token, TokenAt, TokenKind};
    /// # use ruff_text_size::{Ranged, TextSize};
    ///
    /// let source = r#"
    /// def test(arg):
    ///     arg.call()
    ///     if True:
    ///         pass
    ///     print("true")
    /// "#.trim();
    ///
    /// let parsed = ruff_python_parser::parse_unchecked_source(source, PySourceType::Python);
    /// let tokens = parsed.tokens();
    ///
    /// let collect_tokens = |offset: TextSize| {
    ///     tokens.at_offset(offset).into_iter().map(|t| (t.kind(), &source[t.range()])).collect::<Vec<_>>()
    /// };
    ///
    /// assert_eq!(collect_tokens(TextSize::new(4)), vec![(TokenKind::Name, "test")]);
    /// assert_eq!(collect_tokens(TextSize::new(6)), vec![(TokenKind::Name, "test")]);
    /// // between `arg` and `.`
    /// assert_eq!(collect_tokens(TextSize::new(22)), vec![(TokenKind::Name, "arg"), (TokenKind::Dot, ".")]);
    /// assert_eq!(collect_tokens(TextSize::new(36)), vec![(TokenKind::If, "if")]);
    /// // Before the dedent token
    /// assert_eq!(collect_tokens(TextSize::new(57)), vec![]);
    /// ```
    pub fn at_offset(&self, offset: TextSize) -> TokenAt {
        match self.binary_search_by_key(&offset, ruff_text_size::Ranged::start) {
            // The token at `index` starts exactly at `offset`.
            // ```python
            // object.attribute
            //        ^ OFFSET
            // ```
            Ok(index) => {
                let token = self[index];
                // `token` starts exactly at `offset`. Test if the offset is right between
                // `token` and the previous token (if there's any)
                if let Some(previous) = index.checked_sub(1).map(|idx| self[idx]) {
                    if previous.end() == offset {
                        return TokenAt::Between(previous, token);
                    }
                }

                TokenAt::Single(token)
            }

            // No token found that starts exactly at the given offset. But it's possible that
            // the token starting before `offset` fully encloses `offset` (its end range ends after `offset`).
            // ```python
            // object.attribute
            //    ^ OFFSET
            // # or
            // if True:
            //     print("test")
            //  ^ OFFSET
            // ```
            Err(index) => {
                if let Some(previous) = index.checked_sub(1).map(|idx| self[idx]) {
                    if previous.range().contains_inclusive(offset) {
                        return TokenAt::Single(previous);
                    }
                }

                TokenAt::None
            }
        }
    }

    /// Returns a slice of tokens before the given [`TextSize`] offset.
    ///
    /// If the given offset is between two tokens, the returned slice will end just before the
    /// following token. In other words, if the offset is between the end of previous token and
    /// start of next token, the returned slice will end just before the next token.
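    ///
    /// # Example
    ///
    /// A short example (all items used below are public in this crate or its dependencies):
    ///
    /// ```
    /// use ruff_python_ast::PySourceType;
    /// use ruff_python_parser::{parse_unchecked_source, TokenKind};
    /// use ruff_text_size::TextSize;
    ///
    /// let parsed = parse_unchecked_source("x = 1", PySourceType::Python);
    /// // Offset 1 is the end of the `x` token, so only that token comes before it.
    /// let before = parsed.tokens().before(TextSize::new(1));
    /// assert_eq!(before.len(), 1);
    /// assert_eq!(before[0].kind(), TokenKind::Name);
    /// ```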
    ///
    /// # Panics
    ///
    /// If the given offset is inside a token range at any point
    /// other than the start of the range.
    pub fn before(&self, offset: TextSize) -> &[Token] {
        match self.binary_search_by(|token| token.start().cmp(&offset)) {
            Ok(idx) => &self[..idx],
            Err(idx) => {
                // We can't use `saturating_sub` here because a file could contain a BOM header, in
                // which case the token starts at offset 3 for UTF-8 encoded file content.
                if idx > 0 {
                    if let Some(prev) = self.get(idx - 1) {
                        // If it's equal to the end offset, then it's at a token boundary which is
                        // valid. If it's greater than the end offset, then it's in the gap between
                        // the tokens which is valid as well.
                        assert!(
                            offset >= prev.end(),
                            "Offset {:?} is inside a token range {:?}",
                            offset,
                            prev.range()
                        );
                    }
                }

                &self[..idx]
            }
        }
    }

    /// Returns a slice of tokens after the given [`TextSize`] offset.
    ///
    /// If the given offset is between two tokens, the returned slice will start from the following
    /// token. In other words, if the offset is between the end of previous token and start of next
    /// token, the returned slice will start from the next token.
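    ///
    /// # Example
    ///
    /// A short example (all items used below are public in this crate or its dependencies):
    ///
    /// ```
    /// use ruff_python_ast::PySourceType;
    /// use ruff_python_parser::{parse_unchecked_source, TokenKind};
    /// use ruff_text_size::TextSize;
    ///
    /// let parsed = parse_unchecked_source("x = 1", PySourceType::Python);
    /// // Offset 1 is the end of the `x` token, so the slice starts at `=`.
    /// let after = parsed.tokens().after(TextSize::new(1));
    /// assert_eq!(after.first().unwrap().kind(), TokenKind::Equal);
    /// ```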
    ///
    /// # Panics
    ///
    /// If the given offset is inside a token range at any point
    /// other than the start of the range.
    pub fn after(&self, offset: TextSize) -> &[Token] {
        match self.binary_search_by(|token| token.start().cmp(&offset)) {
            Ok(idx) => &self[idx..],
            Err(idx) => {
                // We can't use `saturating_sub` here because a file could contain a BOM header, in
                // which case the token starts at offset 3 for UTF-8 encoded file content.
                if idx > 0 {
                    if let Some(prev) = self.get(idx - 1) {
                        // If it's equal to the end offset, then it's at a token boundary which is
                        // valid. If it's greater than the end offset, then it's in the gap between
                        // the tokens which is valid as well.
                        assert!(
                            offset >= prev.end(),
                            "Offset {:?} is inside a token range {:?}",
                            offset,
                            prev.range()
                        );
                    }
                }

                &self[idx..]
            }
        }
    }
}

impl<'a> IntoIterator for &'a Tokens {
    type Item = &'a Token;
    type IntoIter = std::slice::Iter<'a, Token>;

    fn into_iter(self) -> Self::IntoIter {
        self.iter()
    }
}

impl Deref for Tokens {
    type Target = [Token];

    fn deref(&self) -> &Self::Target {
        &self.raw
    }
}

/// A token that encloses a given offset or ends exactly at it.
#[derive(Debug, Clone)]
pub enum TokenAt {
    /// There's no token at the given offset
    None,

    /// There's a single token at the given offset.
    Single(Token),

    /// The offset falls exactly between two tokens. E.g. `CURSOR` in `call<CURSOR>(arguments)` is
    /// positioned exactly between the `call` and `(` tokens.
    Between(Token, Token),
}

impl Iterator for TokenAt {
    type Item = Token;

    fn next(&mut self) -> Option<Self::Item> {
        match *self {
            TokenAt::None => None,
            TokenAt::Single(token) => {
                *self = TokenAt::None;
                Some(token)
            }
            TokenAt::Between(first, second) => {
                *self = TokenAt::Single(second);
                Some(first)
            }
        }
    }
}

impl FusedIterator for TokenAt {}

impl From<&Tokens> for CommentRanges {
    fn from(tokens: &Tokens) -> Self {
        let mut ranges = vec![];
        for token in tokens {
            if token.kind() == TokenKind::Comment {
                ranges.push(token.range());
            }
        }
        CommentRanges::new(ranges)
    }
}

/// An iterator over the [`Token`]s with context.
///
/// This struct is created by the [`iter_with_context`] method on [`Tokens`]. Refer to its
/// documentation for more details.
///
/// [`iter_with_context`]: Tokens::iter_with_context
#[derive(Debug, Clone)]
pub struct TokenIterWithContext<'a> {
    inner: std::slice::Iter<'a, Token>,
    nesting: u32,
}

impl<'a> TokenIterWithContext<'a> {
    fn new(tokens: &'a [Token]) -> TokenIterWithContext<'a> {
        TokenIterWithContext {
            inner: tokens.iter(),
            nesting: 0,
        }
    }

    /// Return the nesting level the iterator is currently in.
    pub const fn nesting(&self) -> u32 {
        self.nesting
    }

    /// Returns `true` if the iterator is within a parenthesized context.
    pub const fn in_parenthesized_context(&self) -> bool {
        self.nesting > 0
    }

    /// Returns the next [`Token`] in the iterator without consuming it.
    pub fn peek(&self) -> Option<&'a Token> {
        self.clone().next()
    }
}

impl<'a> Iterator for TokenIterWithContext<'a> {
    type Item = &'a Token;

    fn next(&mut self) -> Option<Self::Item> {
        let token = self.inner.next()?;

        match token.kind() {
            TokenKind::Lpar | TokenKind::Lbrace | TokenKind::Lsqb => self.nesting += 1,
            TokenKind::Rpar | TokenKind::Rbrace | TokenKind::Rsqb => {
                self.nesting = self.nesting.saturating_sub(1);
            }
            // This mimics the behavior of re-lexing which reduces the nesting level on the lexer.
            // We don't need to reduce it by 1 because unlike the lexer we see the final token
            // after recovering from every unclosed parenthesis.
            TokenKind::Newline if self.nesting > 0 => {
                self.nesting = 0;
            }
            _ => {}
        }

        Some(token)
    }
}

impl FusedIterator for TokenIterWithContext<'_> {}

/// Controls the different modes by which a source file can be parsed.
///
/// The mode argument specifies in what way code must be parsed.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)]
pub enum Mode {
    /// The code consists of a sequence of statements.
    Module,

    /// The code consists of a single expression.
    Expression,

    /// The code consists of a single expression and is parsed as if it is parenthesized. The parentheses themselves aren't required.
    /// This allows for valid multiline expressions without the need for parentheses
    /// and is specifically useful for parsing string annotations.
    ParenthesizedExpression,

    /// The code consists of a sequence of statements which can include the
    /// escape commands that are part of IPython syntax.
    ///
    /// ## Supported escape commands:
    ///
    /// - [Magic command system] which is limited to [line magics] and can start
    ///   with `?` or `??`.
    /// - [Dynamic object information] which can start with `?` or `??`.
    /// - [System shell access] which can start with `!` or `!!`.
    /// - [Automatic parentheses and quotes] which can start with `/`, `;`, or `,`.
    ///
    /// [Magic command system]: https://ipython.readthedocs.io/en/stable/interactive/reference.html#magic-command-system
    /// [line magics]: https://ipython.readthedocs.io/en/stable/interactive/magics.html#line-magics
    /// [Dynamic object information]: https://ipython.readthedocs.io/en/stable/interactive/reference.html#dynamic-object-information
    /// [System shell access]: https://ipython.readthedocs.io/en/stable/interactive/reference.html#system-shell-access
    /// [Automatic parentheses and quotes]: https://ipython.readthedocs.io/en/stable/interactive/reference.html#automatic-parentheses-and-quotes
    Ipython,
}

impl std::str::FromStr for Mode {
    type Err = ModeParseError;
    fn from_str(s: &str) -> Result<Self, ModeParseError> {
        match s {
            "exec" | "single" => Ok(Mode::Module),
            "eval" => Ok(Mode::Expression),
            "ipython" => Ok(Mode::Ipython),
            _ => Err(ModeParseError),
        }
    }
}

/// A type that can be represented as [Mode].
pub trait AsMode {
    fn as_mode(&self) -> Mode;
}

impl AsMode for PySourceType {
    fn as_mode(&self) -> Mode {
        match self {
            PySourceType::Python | PySourceType::Stub => Mode::Module,
            PySourceType::Ipynb => Mode::Ipython,
        }
    }
}

/// Returned when a given mode is not valid.
#[derive(Debug)]
pub struct ModeParseError;

impl std::fmt::Display for ModeParseError {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, r#"mode must be "exec", "eval", "ipython", or "single""#)
    }
}

#[cfg(test)]
mod tests {
    use std::ops::Range;

    use crate::token::TokenFlags;

    use super::*;

    /// Test case containing a "gap" between two tokens.
    ///
    /// Code: <https://play.ruff.rs/a3658340-6df8-42c5-be80-178744bf1193>
    const TEST_CASE_WITH_GAP: [(TokenKind, Range<u32>); 10] = [
        (TokenKind::Def, 0..3),
        (TokenKind::Name, 4..7),
        (TokenKind::Lpar, 7..8),
        (TokenKind::Rpar, 8..9),
        (TokenKind::Colon, 9..10),
        (TokenKind::Newline, 10..11),
        // Gap ||..||
        (TokenKind::Comment, 15..24),
        (TokenKind::NonLogicalNewline, 24..25),
        (TokenKind::Indent, 25..29),
        (TokenKind::Pass, 29..33),
        // No newline at the end to keep the token set full of unique tokens
    ];

    /// Helper function to create [`Tokens`] from an iterator of (kind, range).
    fn new_tokens(tokens: impl Iterator<Item = (TokenKind, Range<u32>)>) -> Tokens {
        Tokens::new(
            tokens
                .map(|(kind, range)| {
                    Token::new(
                        kind,
                        TextRange::new(TextSize::new(range.start), TextSize::new(range.end)),
                        TokenFlags::empty(),
                    )
                })
                .collect(),
        )
    }

    #[test]
    fn tokens_after_offset_at_token_start() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let after = tokens.after(TextSize::new(8));
        assert_eq!(after.len(), 7);
        assert_eq!(after.first().unwrap().kind(), TokenKind::Rpar);
    }

    #[test]
    fn tokens_after_offset_at_token_end() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let after = tokens.after(TextSize::new(11));
        assert_eq!(after.len(), 4);
        assert_eq!(after.first().unwrap().kind(), TokenKind::Comment);
    }

    #[test]
    fn tokens_after_offset_between_tokens() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let after = tokens.after(TextSize::new(13));
        assert_eq!(after.len(), 4);
        assert_eq!(after.first().unwrap().kind(), TokenKind::Comment);
    }

    #[test]
    fn tokens_after_offset_at_last_token_end() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let after = tokens.after(TextSize::new(33));
        assert_eq!(after.len(), 0);
    }

    #[test]
    #[should_panic(expected = "Offset 5 is inside a token range 4..7")]
    fn tokens_after_offset_inside_token() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        tokens.after(TextSize::new(5));
    }

    #[test]
    fn tokens_before_offset_at_first_token_start() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(0));
        assert_eq!(before.len(), 0);
    }

    #[test]
    fn tokens_before_offset_after_first_token_gap() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(3));
        assert_eq!(before.len(), 1);
        assert_eq!(before.last().unwrap().kind(), TokenKind::Def);
    }

    #[test]
    fn tokens_before_offset_at_second_token_start() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(4));
        assert_eq!(before.len(), 1);
        assert_eq!(before.last().unwrap().kind(), TokenKind::Def);
    }

    #[test]
    fn tokens_before_offset_at_token_start() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(8));
        assert_eq!(before.len(), 3);
        assert_eq!(before.last().unwrap().kind(), TokenKind::Lpar);
    }

    #[test]
    fn tokens_before_offset_at_token_end() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(11));
        assert_eq!(before.len(), 6);
        assert_eq!(before.last().unwrap().kind(), TokenKind::Newline);
    }

    #[test]
    fn tokens_before_offset_between_tokens() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(13));
        assert_eq!(before.len(), 6);
        assert_eq!(before.last().unwrap().kind(), TokenKind::Newline);
    }

    #[test]
    fn tokens_before_offset_at_last_token_end() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(33));
        assert_eq!(before.len(), 10);
        assert_eq!(before.last().unwrap().kind(), TokenKind::Pass);
    }

    #[test]
    #[should_panic(expected = "Offset 5 is inside a token range 4..7")]
    fn tokens_before_offset_inside_token() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        tokens.before(TextSize::new(5));
    }

    #[test]
    fn tokens_in_range_at_token_offset() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let in_range = tokens.in_range(TextRange::new(4.into(), 10.into()));
        assert_eq!(in_range.len(), 4);
        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Name);
        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Colon);
    }

    #[test]
    fn tokens_in_range_start_offset_at_token_end() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let in_range = tokens.in_range(TextRange::new(11.into(), 29.into()));
        assert_eq!(in_range.len(), 3);
        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Comment);
        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Indent);
    }

    #[test]
    fn tokens_in_range_end_offset_at_token_start() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let in_range = tokens.in_range(TextRange::new(8.into(), 15.into()));
        assert_eq!(in_range.len(), 3);
        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Rpar);
        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Newline);
    }

    #[test]
    fn tokens_in_range_start_offset_between_tokens() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let in_range = tokens.in_range(TextRange::new(13.into(), 29.into()));
        assert_eq!(in_range.len(), 3);
        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Comment);
        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Indent);
    }

    #[test]
    fn tokens_in_range_end_offset_between_tokens() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let in_range = tokens.in_range(TextRange::new(9.into(), 13.into()));
        assert_eq!(in_range.len(), 2);
        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Colon);
        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Newline);
    }

    #[test]
    #[should_panic(expected = "Offset 5 is inside a token range 4..7")]
    fn tokens_in_range_start_offset_inside_token() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        tokens.in_range(TextRange::new(5.into(), 10.into()));
    }

    #[test]
    #[should_panic(expected = "End offset 6 is inside a token range 4..7")]
    fn tokens_in_range_end_offset_inside_token() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        tokens.in_range(TextRange::new(0.into(), 6.into()));
    }
}