Add basic docs for the parser crate (#11199)

## Summary This PR adds a basic README for the `ruff_python_parser` crate and updates the CONTRIBUTING docs with the fuzzer and benchmark section. Additionally, it also updates some inline documentation within the parser crate and splits the `parse_program` function into `parse_single_expression` and `parse_module` which will be called by matching against the `Mode`. This PR doesn't go into too much internal detail around the parser logic due to the following reasons: 1. Where should the docs go? Should it be as a module docs in `lib.rs` or in README? 2. The parser is still evolving and could include a lot of refactors with the future work (feedback loop and improved error recovery and resilience) --------- Co-authored-by: Alex Waygood <Alex.Waygood@Gmail.com>
2025-10-02 06:42:02 +00:00 · 2024-04-29 22:38:07 +05:30 · 2024-04-29 22:38:07 +05:30 · 04a922866a
commit 04a922866a
parent 0ed7af35ec
7 changed files with 199 additions and 67 deletions
--- a/crates/ruff_python_parser/src/parser/mod.rs
+++ b/crates/ruff_python_parser/src/parser/mod.rs
@ -26,6 +26,9 @@ mod statement;
 #[cfg(test)]
 mod tests;

+/// Represents the parsed source code.
+///
+/// This includes the AST and all of the errors encountered during parsing.
 #[derive(Debug)]
 pub struct Program {
    ast: ast::Mod,
@ -43,12 +46,12 @@ impl Program {
        &self.parse_errors
    }

-    /// Consumes the `Program` and returns the parsed AST.
+    /// Consumes the [`Program`] and returns the parsed AST.
    pub fn into_ast(self) -> ast::Mod {
        self.ast
    }

-    /// Consumes the `Program` and returns a list of syntax errors found during parsing.
+    /// Consumes the [`Program`] and returns a list of syntax errors found during parsing.
    pub fn into_errors(self) -> Vec<ParseError> {
        self.parse_errors
    }
@ -58,11 +61,13 @@ impl Program {
        self.parse_errors.is_empty()
    }

+    /// Parse the given Python source code using the specified [`Mode`].
    pub fn parse_str(source: &str, mode: Mode) -> Program {
        let tokens = lex(source, mode);
        Self::parse_tokens(source, tokens.collect(), mode)
    }

+    /// Parse a vector of [`LexResult`]s using the specified [`Mode`].
    pub fn parse_tokens(source: &str, tokens: Vec<LexResult>, mode: Mode) -> Program {
        Parser::new(source, mode, TokenSource::new(tokens)).parse_program()
    }
@ -124,49 +129,11 @@ impl<'src> Parser<'src> {
        }
    }

+    /// Consumes the [`Parser`] and returns the parsed [`Program`].
    pub(crate) fn parse_program(mut self) -> Program {
-        let ast = if self.mode == Mode::Expression {
-            let start = self.node_start();
-            let parsed_expr = self.parse_expression_list(ExpressionContext::default());
-
-            // All of the remaining newlines are actually going to be non-logical newlines.
-            self.eat(TokenKind::Newline);
-
-            if !self.at(TokenKind::EndOfFile) {
-                self.add_error(
-                    ParseErrorType::UnexpectedExpressionToken,
-                    self.current_token_range(),
-                );
-
-                // TODO(dhruvmanila): How should error recovery work here? Just truncate after the expression?
-                let mut progress = ParserProgress::default();
-                loop {
-                    progress.assert_progressing(&self);
-                    if self.at(TokenKind::EndOfFile) {
-                        break;
-                    }
-                    self.next_token();
-                }
-            }
-
-            self.bump(TokenKind::EndOfFile);
-
-            Mod::Expression(ast::ModExpression {
-                body: Box::new(parsed_expr.expr),
-                range: self.node_range(start),
-            })
-        } else {
-            let body = self.parse_list_into_vec(
-                RecoveryContextKind::ModuleStatements,
-                Parser::parse_statement,
-            );
-
-            self.bump(TokenKind::EndOfFile);
-
-            Mod::Module(ast::ModModule {
-                body,
-                range: self.tokens_range,
-            })
+        let ast = match self.mode {
+            Mode::Expression => Mod::Expression(self.parse_single_expression()),
+            Mode::Module | Mode::Ipython => Mod::Module(self.parse_module()),
        };

        Program {
@ -175,6 +142,63 @@ impl<'src> Parser<'src> {
        }
    }

+    /// Parses a single expression.
+    ///
+    /// This is to be used for [`Mode::Expression`].
+    ///
+    /// ## Recovery
+    ///
+    /// After parsing a single expression, an error is reported and all remaining tokens are
+    /// dropped by the parser.
+    fn parse_single_expression(&mut self) -> ast::ModExpression {
+        let start = self.node_start();
+        let parsed_expr = self.parse_expression_list(ExpressionContext::default());
+
+        // All remaining newlines are actually going to be non-logical newlines.
+        self.eat(TokenKind::Newline);
+
+        if !self.at(TokenKind::EndOfFile) {
+            self.add_error(
+                ParseErrorType::UnexpectedExpressionToken,
+                self.current_token_range(),
+            );
+
+            // TODO(dhruvmanila): How should error recovery work here? Just truncate after the expression?
+            let mut progress = ParserProgress::default();
+            loop {
+                progress.assert_progressing(self);
+                if self.at(TokenKind::EndOfFile) {
+                    break;
+                }
+                self.next_token();
+            }
+        }
+
+        self.bump(TokenKind::EndOfFile);
+
+        ast::ModExpression {
+            body: Box::new(parsed_expr.expr),
+            range: self.node_range(start),
+        }
+    }
+
+    /// Parses a Python module.
+    ///
+    /// This is to be used for [`Mode::Module`] and [`Mode::Ipython`].
+    fn parse_module(&mut self) -> ast::ModModule {
+        let body = self.parse_list_into_vec(
+            RecoveryContextKind::ModuleStatements,
+            Parser::parse_statement,
+        );
+
+        self.bump(TokenKind::EndOfFile);
+
+        ast::ModModule {
+            body,
+            range: self.tokens_range,
+        }
+    }
+
    fn finish(self) -> Vec<ParseError> {
        assert_eq!(
            self.current_token_kind(),