Add basic docs for the parser crate (#11199)

## Summary

This PR adds a basic README for the `ruff_python_parser` crate and
updates the CONTRIBUTING docs with the fuzzer and benchmark section.

It also updates some inline documentation within the parser crate and
splits the `parse_program` function into `parse_single_expression` and
`parse_module`, one of which is selected by matching on the parser's
`Mode`.

This PR doesn't go into much internal detail about the parser logic,
for the following reasons:
1. Where should the docs go? Should they be module-level docs in `lib.rs`
or live in the README?
2. The parser is still evolving and is likely to see a lot of refactoring
as part of future work (feedback loop, improved error recovery and
resilience).

---------

Co-authored-by: Alex Waygood <Alex.Waygood@Gmail.com>
This commit is contained in:
Dhruv Manilawala 2024-04-29 22:38:07 +05:30 committed by GitHub
parent 0ed7af35ec
commit 04a922866a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 199 additions and 67 deletions

View file

@ -26,6 +26,9 @@ mod statement;
#[cfg(test)]
mod tests;
/// Represents the parsed source code.
///
/// This includes the AST and all of the errors encountered during parsing.
#[derive(Debug)]
pub struct Program {
ast: ast::Mod,
@ -43,12 +46,12 @@ impl Program {
&self.parse_errors
}
/// Consumes the `Program` and returns the parsed AST.
/// Consumes the [`Program`] and returns the parsed AST.
pub fn into_ast(self) -> ast::Mod {
self.ast
}
/// Consumes the `Program` and returns a list of syntax errors found during parsing.
/// Consumes the [`Program`] and returns a list of syntax errors found during parsing.
pub fn into_errors(self) -> Vec<ParseError> {
self.parse_errors
}
@ -58,11 +61,13 @@ impl Program {
self.parse_errors.is_empty()
}
/// Parse the given Python source code using the specified [`Mode`].
pub fn parse_str(source: &str, mode: Mode) -> Program {
let tokens = lex(source, mode);
Self::parse_tokens(source, tokens.collect(), mode)
}
/// Parses an already-lexed vector of [`LexResult`]s into a [`Program`] using the given
/// [`Mode`].
///
/// `source` is the text the tokens were produced from.
pub fn parse_tokens(source: &str, tokens: Vec<LexResult>, mode: Mode) -> Program {
    let token_source = TokenSource::new(tokens);
    let parser = Parser::new(source, mode, token_source);
    parser.parse_program()
}
@ -124,49 +129,11 @@ impl<'src> Parser<'src> {
}
}
/// Consumes the [`Parser`] and returns the parsed [`Program`].
pub(crate) fn parse_program(mut self) -> Program {
let ast = if self.mode == Mode::Expression {
let start = self.node_start();
let parsed_expr = self.parse_expression_list(ExpressionContext::default());
// All of the remaining newlines are actually going to be non-logical newlines.
self.eat(TokenKind::Newline);
if !self.at(TokenKind::EndOfFile) {
self.add_error(
ParseErrorType::UnexpectedExpressionToken,
self.current_token_range(),
);
// TODO(dhruvmanila): How should error recovery work here? Just truncate after the expression?
let mut progress = ParserProgress::default();
loop {
progress.assert_progressing(&self);
if self.at(TokenKind::EndOfFile) {
break;
}
self.next_token();
}
}
self.bump(TokenKind::EndOfFile);
Mod::Expression(ast::ModExpression {
body: Box::new(parsed_expr.expr),
range: self.node_range(start),
})
} else {
let body = self.parse_list_into_vec(
RecoveryContextKind::ModuleStatements,
Parser::parse_statement,
);
self.bump(TokenKind::EndOfFile);
Mod::Module(ast::ModModule {
body,
range: self.tokens_range,
})
let ast = match self.mode {
Mode::Expression => Mod::Expression(self.parse_single_expression()),
Mode::Module | Mode::Ipython => Mod::Module(self.parse_module()),
};
Program {
@ -175,6 +142,63 @@ impl<'src> Parser<'src> {
}
}
/// Parses exactly one expression (list) and wraps it in an [`ast::ModExpression`].
///
/// Intended for [`Mode::Expression`].
///
/// ## Recovery
///
/// If anything other than the end of file follows the expression (and an optional
/// trailing newline), an `UnexpectedExpressionToken` error is recorded at the first
/// offending token and every remaining token is skipped.
fn parse_single_expression(&mut self) -> ast::ModExpression {
    let start = self.node_start();
    let parsed_expr = self.parse_expression_list(ExpressionContext::default());

    // All remaining newlines are actually going to be non-logical newlines.
    self.eat(TokenKind::Newline);

    let has_trailing_tokens = !self.at(TokenKind::EndOfFile);
    if has_trailing_tokens {
        let unexpected_range = self.current_token_range();
        self.add_error(ParseErrorType::UnexpectedExpressionToken, unexpected_range);

        // TODO(dhruvmanila): How should error recovery work here? Just truncate after the expression?
        // Skip everything up to the end of file; the progress guard is checked on every
        // iteration, including the last one, before the end-of-file test.
        let mut progress = ParserProgress::default();
        loop {
            progress.assert_progressing(self);
            if self.at(TokenKind::EndOfFile) {
                break;
            }
            self.next_token();
        }
    }

    self.bump(TokenKind::EndOfFile);

    let body = Box::new(parsed_expr.expr);
    let range = self.node_range(start);
    ast::ModExpression { body, range }
}
/// Parses a whole Python module: a list of statements followed by the end of file.
///
/// Intended for [`Mode::Module`] and [`Mode::Ipython`]. The returned node's range is the
/// parser's `tokens_range`.
fn parse_module(&mut self) -> ast::ModModule {
    let statements = self.parse_list_into_vec(
        RecoveryContextKind::ModuleStatements,
        Parser::parse_statement,
    );

    self.bump(TokenKind::EndOfFile);

    let range = self.tokens_range;
    ast::ModModule {
        body: statements,
        range,
    }
}
fn finish(self) -> Vec<ParseError> {
assert_eq!(
self.current_token_kind(),