diff --git a/crates/ruff_python_ast/src/nodes.rs b/crates/ruff_python_ast/src/nodes.rs
index 3fd425684b..2a39984c53 100644
--- a/crates/ruff_python_ast/src/nodes.rs
+++ b/crates/ruff_python_ast/src/nodes.rs
@@ -3615,6 +3615,9 @@ impl Deref for TypeParams {
     }
 }
 
+/// A suite represents a [Vec] of [Stmt].
+///
+/// See: <https://docs.python.org/3/reference/compound_stmts.html#grammar-token-python-grammar-suite>
 pub type Suite = Vec<Stmt>;
 
 /// The kind of escape command as defined in [IPython Syntax] in the IPython codebase.
diff --git a/crates/ruff_python_parser/CONTRIBUTING.md b/crates/ruff_python_parser/CONTRIBUTING.md
index ec8c0ccfdd..1cb58a9958 100644
--- a/crates/ruff_python_parser/CONTRIBUTING.md
+++ b/crates/ruff_python_parser/CONTRIBUTING.md
@@ -52,3 +52,49 @@ Then, run the Parser test suite with the following command:
 ```sh
 cargo test --package ruff_python_parser
 ```
+
+### Python-based fuzzer
+
+The Ruff project includes a Python-based fuzzer that can be used to run the parser on
+randomly generated (but syntactically valid) Python source code files.
+
+To run the fuzzer, first install the required dependencies:
+
+```sh
+uv pip install -r scripts/fuzz-parser/requirements.txt
+```
+
+Then, run the fuzzer with the following command:
+
+```sh
+python scripts/fuzz-parser/fuzz.py
+```
+
+Refer to the [fuzz.py](https://github.com/astral-sh/ruff/blob/main/scripts/fuzz-parser/fuzz.py)
+script for more information, or use the `--help` flag to see the available options.
+
+#### CI
+
+The fuzzer is run as part of the CI pipeline to catch any regressions introduced by new changes
+to the parser. This is why the fuzzer is run on the same set of seeds on every run.
+
+## Benchmarks
+
+The `ruff_benchmark` crate can benchmark both the lexer and the parser.
+
+To run the lexer benchmarks, use the following command:
+
+```sh
+cargo bench --package ruff_benchmark --bench lexer
+```
+
+And to run the parser benchmarks, use the following command:
+
+```sh
+cargo bench --package ruff_benchmark --bench parser
+```
+
+Refer to the [Benchmarking and
+Profiling](https://docs.astral.sh/ruff/contributing/#benchmark-driven-development) section in the
+contributing guide for more information.
diff --git a/crates/ruff_python_parser/README.md b/crates/ruff_python_parser/README.md
new file mode 100644
index 0000000000..1273f8f0f3
--- /dev/null
+++ b/crates/ruff_python_parser/README.md
@@ -0,0 +1,22 @@
+# Ruff Python Parser
+
+Ruff's Python parser is a hand-written [recursive descent parser] which can parse
+Python source code into an Abstract Syntax Tree (AST). It also utilizes the [Pratt
+parsing](https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html)
+technique to parse expressions with different [precedence](https://docs.python.org/3/reference/expressions.html#operator-precedence).
+
+Try out the parser in the [playground](https://play.ruff.rs/?secondary=AST).
+
+## Python version support
+
+The parser supports the latest Python syntax, which is currently Python 3.12.
+It does not emit syntax errors if it encounters a syntax feature that is not
+supported by the [`target-version`](https://docs.astral.sh/ruff/settings/#target-version).
+This will be fixed in a future release (see ).
+
+## Contributing
+
+Refer to the [contributing guidelines](./CONTRIBUTING.md) to get started, and check the GitHub issues
+with the [parser label](https://github.com/astral-sh/ruff/issues?q=is:open+is:issue+label:parser) for issues that need help.
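+
+## Example
+
+A minimal sketch of parsing source code with the crate's top-level API (the exact
+re-exports may differ; see the crate documentation for the authoritative usage):
+
+```rust
+use ruff_python_parser::parse_program;
+
+// Parse a module and check that no syntax errors were found.
+let program = parse_program("print('Hello world')");
+assert!(program.is_ok());
+```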
+
+[recursive descent parser]: https://en.wikipedia.org/wiki/Recursive_descent_parser
diff --git a/crates/ruff_python_parser/src/lib.rs b/crates/ruff_python_parser/src/lib.rs
index ce915b04d4..ee7a7399fd 100644
--- a/crates/ruff_python_parser/src/lib.rs
+++ b/crates/ruff_python_parser/src/lib.rs
@@ -1,7 +1,7 @@
 //! This crate can be used to parse Python source code into an Abstract
 //! Syntax Tree.
 //!
-//! ## Overview:
+//! ## Overview
 //!
 //! The process by which source code is parsed into an AST can be broken down
 //! into two general stages: [lexical analysis] and [parsing].
@@ -15,7 +15,7 @@
 //!     Name("print"), LeftParen, String("Hello world"), RightParen
 //! ```
 //!
-//! these tokens are then consumed by the `ruff_python_parser`, which matches them against a set of
+//! These tokens are then consumed by the `ruff_python_parser`, which matches them against a set of
 //! grammar rules to verify that the source code is syntactically valid and to construct
 //! an AST that represents the source code.
 //!
@@ -48,16 +48,16 @@
 //!     },
 //!```
 //!
-//! Note: The Tokens/ASTs shown above are not the exact tokens/ASTs generated by the `ruff_python_parser`.
+//! **Note:** The Tokens/ASTs shown above are not the exact tokens/ASTs generated by the `ruff_python_parser`.
+//! Refer to the [playground](https://play.ruff.rs) for the correct representation.
 //!
-//! ## Source code layout:
+//! ## Source code layout
 //!
 //! The functionality of this crate is split into several modules:
 //!
 //! - token: This module contains the definition of the tokens that are generated by the lexer.
 //! - [lexer]: This module contains the lexer and is responsible for generating the tokens.
-//! - `ruff_python_parser`: This module contains an interface to the `ruff_python_parser` and is responsible for generating the AST.
-//!   Functions and strings have special parsing requirements that are handled in additional files.
+//! - parser: This module contains an interface to the [Program] and is responsible for generating the AST.
 //! - mode: This module contains the definition of the different modes that the `ruff_python_parser` can be in.
 //!
 //! # Examples
@@ -78,14 +78,15 @@
 //! These tokens can be directly fed into the `ruff_python_parser` to generate an AST:
 //!
 //! ```
-//! use ruff_python_parser::{Mode, parse_tokens, tokenize_all};
+//! use ruff_python_parser::lexer::lex;
+//! use ruff_python_parser::{Mode, parse_tokens};
 //!
 //! let python_source = r#"
 //! def is_odd(i):
 //!    return bool(i & 1)
 //! "#;
-//! let tokens = tokenize_all(python_source, Mode::Module);
-//! let ast = parse_tokens(tokens, python_source, Mode::Module);
+//! let tokens = lex(python_source, Mode::Module);
+//! let ast = parse_tokens(tokens.collect(), python_source, Mode::Module);
 //!
 //! assert!(ast.is_ok());
 //! ```
@@ -138,14 +139,16 @@ pub mod typing;
 /// For example, parsing a simple function definition and a call to that function:
 ///
 /// ```
-/// use ruff_python_parser as parser;
+/// use ruff_python_parser::parse_program;
+///
 /// let source = r#"
 /// def foo():
 ///    return 42
 ///
 /// print(foo())
 /// "#;
-/// let program = parser::parse_program(source);
+///
+/// let program = parse_program(source);
 /// assert!(program.is_ok());
 /// ```
 pub fn parse_program(source: &str) -> Result<ModModule, ParseError> {
@@ -156,6 +159,28 @@ pub fn parse_program(source: &str) -> Result<ModModule, ParseError> {
     }
 }
 
+/// Parse a full Python program into a [`Suite`].
+///
+/// This function is similar to [`parse_program`] except that it returns the module body
+/// instead of the module itself.
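+///
+/// Note that a [`Suite`] is only a type alias for `Vec<Stmt>`, so the returned body
+/// can be traversed directly.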
+///
+/// # Example
+///
+/// For example, parsing a simple function definition and a call to that function:
+///
+/// ```
+/// use ruff_python_parser::parse_suite;
+///
+/// let source = r#"
+/// def foo():
+///    return 42
+///
+/// print(foo())
+/// "#;
+///
+/// let body = parse_suite(source);
+/// assert!(body.is_ok());
+/// ```
 pub fn parse_suite(source: &str) -> Result<Suite, ParseError> {
     parse_program(source).map(|m| m.body)
 }
@@ -169,12 +194,11 @@ pub fn parse_suite(source: &str) -> Result<Suite, ParseError> {
 ///
 /// For example, parsing a single expression denoting the addition of two numbers:
 ///
-/// ```
-/// use ruff_python_parser as parser;
-/// let expr = parser::parse_expression("1 + 2");
+/// ```
+/// use ruff_python_parser::parse_expression;
 ///
+/// let expr = parse_expression("1 + 2");
 /// assert!(expr.is_ok());
-///
 /// ```
 pub fn parse_expression(source: &str) -> Result<Expr, ParseError> {
     let lexer = lex(source, Mode::Expression).collect();
@@ -195,7 +219,7 @@ pub fn parse_expression(source: &str) -> Result<Expr, ParseError> {
 /// somewhat silly, location:
 ///
 /// ```
-/// use ruff_python_parser::{parse_expression_starts_at};
+/// use ruff_python_parser::parse_expression_starts_at;
 /// # use ruff_text_size::TextSize;
 ///
 /// let expr = parse_expression_starts_at("1 + 2", TextSize::from(400));
@@ -262,7 +286,7 @@ pub fn parse(source: &str, mode: Mode) -> Result<Mod, ParseError> {
 
 /// Parse the given Python source code using the specified [`Mode`] and [`TextSize`].
 ///
-/// This function allows to specify the location of the the source code, other than
+/// This function allows you to specify the location of the source code; other than
 /// that, it behaves exactly like [`parse`].
 ///
 /// # Example
 ///
@@ -298,10 +322,12 @@ pub fn parse_starts_at(source: &str, mode: Mode, offset: TextSize) -> Result<Mod, ParseError> {
 pub fn parse_tokens(tokens: Vec<LexResult>, source: &str, mode: Mode) -> Result<Mod, ParseError> {
@@ -370,13 +396,16 @@ pub fn parse_program_tokens(
 }
 
 /// Control in the different modes by which a source file can be parsed.
+///
 /// The mode argument specifies in what way code must be parsed.
 #[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)]
 pub enum Mode {
     /// The code consists of a sequence of statements.
     Module,
+
     /// The code consists of a single expression.
     Expression,
+
     /// The code consists of a sequence of statements which can include the
     /// escape commands that are part of IPython syntax.
     ///
@@ -408,6 +437,7 @@ impl std::str::FromStr for Mode {
     }
 }
 
+/// A type that can be represented as [Mode].
 pub trait AsMode {
     fn as_mode(&self) -> Mode;
 }
diff --git a/crates/ruff_python_parser/src/parser/mod.rs b/crates/ruff_python_parser/src/parser/mod.rs
index 1f1db32b9b..2545b1dc86 100644
--- a/crates/ruff_python_parser/src/parser/mod.rs
+++ b/crates/ruff_python_parser/src/parser/mod.rs
@@ -26,6 +26,9 @@ mod statement;
 #[cfg(test)]
 mod tests;
 
+/// Represents the parsed source code.
+///
+/// This includes the AST and all of the errors encountered during parsing.
 #[derive(Debug)]
 pub struct Program {
     ast: ast::Mod,
@@ -43,12 +46,12 @@ impl Program {
         &self.parse_errors
     }
 
-    /// Consumes the `Program` and returns the parsed AST.
+    /// Consumes the [`Program`] and returns the parsed AST.
     pub fn into_ast(self) -> ast::Mod {
         self.ast
     }
 
-    /// Consumes the `Program` and returns a list of syntax errors found during parsing.
+    /// Consumes the [`Program`] and returns a list of syntax errors found during parsing.
     pub fn into_errors(self) -> Vec<ParseError> {
         self.parse_errors
     }
@@ -58,11 +61,13 @@ impl Program {
         self.parse_errors.is_empty()
     }
 
+    /// Parse the given Python source code using the specified [`Mode`].
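+    ///
+    /// # Example
+    ///
+    /// A minimal sketch; the import path assumes that [`Program`] and [`Mode`] are
+    /// re-exported from the crate root, which may need adjusting:
+    ///
+    /// ```ignore
+    /// use ruff_python_parser::{Mode, Program};
+    ///
+    /// // Parse a module and check that no syntax errors were collected.
+    /// let program = Program::parse_str("x = 1", Mode::Module);
+    /// assert!(program.into_errors().is_empty());
+    /// ```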
     pub fn parse_str(source: &str, mode: Mode) -> Program {
         let tokens = lex(source, mode);
         Self::parse_tokens(source, tokens.collect(), mode)
     }
 
+    /// Parse a vector of [`LexResult`]s using the specified [`Mode`].
     pub fn parse_tokens(source: &str, tokens: Vec<LexResult>, mode: Mode) -> Program {
         Parser::new(source, mode, TokenSource::new(tokens)).parse_program()
     }
@@ -124,49 +129,11 @@ impl<'src> Parser<'src> {
         }
     }
 
+    /// Consumes the [`Parser`] and returns the parsed [`Program`].
     pub(crate) fn parse_program(mut self) -> Program {
-        let ast = if self.mode == Mode::Expression {
-            let start = self.node_start();
-            let parsed_expr = self.parse_expression_list(ExpressionContext::default());
-
-            // All of the remaining newlines are actually going to be non-logical newlines.
-            self.eat(TokenKind::Newline);
-
-            if !self.at(TokenKind::EndOfFile) {
-                self.add_error(
-                    ParseErrorType::UnexpectedExpressionToken,
-                    self.current_token_range(),
-                );
-
-                // TODO(dhruvmanila): How should error recovery work here? Just truncate after the expression?
-                let mut progress = ParserProgress::default();
-                loop {
-                    progress.assert_progressing(&self);
-                    if self.at(TokenKind::EndOfFile) {
-                        break;
-                    }
-                    self.next_token();
-                }
-            }
-
-            self.bump(TokenKind::EndOfFile);
-
-            Mod::Expression(ast::ModExpression {
-                body: Box::new(parsed_expr.expr),
-                range: self.node_range(start),
-            })
-        } else {
-            let body = self.parse_list_into_vec(
-                RecoveryContextKind::ModuleStatements,
-                Parser::parse_statement,
-            );
-
-            self.bump(TokenKind::EndOfFile);
-
-            Mod::Module(ast::ModModule {
-                body,
-                range: self.tokens_range,
-            })
+        let ast = match self.mode {
+            Mode::Expression => Mod::Expression(self.parse_single_expression()),
+            Mode::Module | Mode::Ipython => Mod::Module(self.parse_module()),
         };
 
         Program {
@@ -175,6 +142,63 @@
         }
     }
 
+    /// Parses a single expression.
+    ///
+    /// This is to be used for [`Mode::Expression`].
+    ///
+    /// ## Recovery
+    ///
+    /// After parsing a single expression, an error is reported and all remaining tokens are
+    /// dropped by the parser.
+    fn parse_single_expression(&mut self) -> ast::ModExpression {
+        let start = self.node_start();
+        let parsed_expr = self.parse_expression_list(ExpressionContext::default());
+
+        // All remaining newlines are actually going to be non-logical newlines.
+        self.eat(TokenKind::Newline);
+
+        if !self.at(TokenKind::EndOfFile) {
+            self.add_error(
+                ParseErrorType::UnexpectedExpressionToken,
+                self.current_token_range(),
+            );
+
+            // TODO(dhruvmanila): How should error recovery work here? Just truncate after the expression?
+            let mut progress = ParserProgress::default();
+            loop {
+                progress.assert_progressing(self);
+                if self.at(TokenKind::EndOfFile) {
+                    break;
+                }
+                self.next_token();
+            }
+        }
+
+        self.bump(TokenKind::EndOfFile);
+
+        ast::ModExpression {
+            body: Box::new(parsed_expr.expr),
+            range: self.node_range(start),
+        }
+    }
+
+    /// Parses a Python module.
+    ///
+    /// This is to be used for [`Mode::Module`] and [`Mode::Ipython`].
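+    ///
+    /// ## Recovery
+    ///
+    /// Errors encountered while parsing a statement don't abort the module: the parser
+    /// recovers at statement boundaries and collects the errors into the resulting
+    /// [`Program`] (a sketch of the behavior implied by
+    /// [`RecoveryContextKind::ModuleStatements`]).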
+    fn parse_module(&mut self) -> ast::ModModule {
+        let body = self.parse_list_into_vec(
+            RecoveryContextKind::ModuleStatements,
+            Parser::parse_statement,
+        );
+
+        self.bump(TokenKind::EndOfFile);
+
+        ast::ModModule {
+            body,
+            range: self.tokens_range,
+        }
+    }
+
     fn finish(self) -> Vec<ParseError> {
         assert_eq!(
             self.current_token_kind(),
diff --git a/crates/ruff_python_parser/src/token.rs b/crates/ruff_python_parser/src/token.rs
index e9ae4d3beb..0e1ecfaf0c 100644
--- a/crates/ruff_python_parser/src/token.rs
+++ b/crates/ruff_python_parser/src/token.rs
@@ -3,7 +3,7 @@
 //! This module defines the tokens that the lexer recognizes. The tokens are
 //! loosely based on the token definitions found in the [CPython source].
 //!
-//! [CPython source]: https://github.com/python/cpython/blob/dfc2e065a2e71011017077e549cd2f9bf4944c54/Include/internal/pycore_token.h;
+//! [CPython source]: https://github.com/python/cpython/blob/dfc2e065a2e71011017077e549cd2f9bf4944c54/Grammar/Tokens
 
 use ruff_python_ast::{AnyStringKind, BoolOp, Int, IpyEscapeKind, Operator, UnaryOp};
 use std::fmt;
@@ -352,6 +352,10 @@ impl fmt::Display for Tok {
     }
 }
 
+/// A kind of token.
+///
+/// This is a lightweight representation of [`Tok`] which carries only the kind of the
+/// token, not its value.
 #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
 pub enum TokenKind {
     /// Token value for a name, commonly known as an identifier.
diff --git a/crates/ruff_python_parser/src/typing.rs b/crates/ruff_python_parser/src/typing.rs
index 477f4b466b..c8d82304e9 100644
--- a/crates/ruff_python_parser/src/typing.rs
+++ b/crates/ruff_python_parser/src/typing.rs
@@ -1,10 +1,13 @@
-use crate::{parse_expression, parse_expression_starts_at};
+//! This module takes care of parsing a type annotation.
+
 use anyhow::Result;
+
 use ruff_python_ast::relocate::relocate_expr;
-use ruff_python_ast::str;
-use ruff_python_ast::Expr;
+use ruff_python_ast::{str, Expr};
 use ruff_text_size::{TextLen, TextRange};
 
+use crate::{parse_expression, parse_expression_starts_at};
+
 #[derive(is_macro::Is, Copy, Clone, Debug)]
 pub enum AnnotationKind {
     /// The annotation is defined as part a simple string literal,