Add basic docs for the parser crate (#11199)

## Summary

This PR adds a basic README for the `ruff_python_parser` crate and
updates the CONTRIBUTING docs with sections on the fuzzer and the benchmarks.

It also updates some inline documentation within the parser crate and
splits the `parse_program` function into `parse_single_expression` and
`parse_module`, which are called by matching on the `Mode`.
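
For reference, the new top-level dispatch reduces to a match on `Mode`. Here's a minimal, self-contained sketch (the stand-in types below mirror the real `Mode`, `Mod`, and `Parser`; the `parse_*` bodies are placeholders, not the crate's implementation):

```rust
// Minimal sketch of the Mode-based dispatch introduced in this PR.
// `Mode` and `Mod` mirror the parser's types; the parse_* bodies are
// placeholders standing in for the real AST construction.
#[derive(Clone, Copy)]
enum Mode {
    Module,
    Expression,
    Ipython,
}

enum Mod {
    Module(String),     // stands in for ast::ModModule
    Expression(String), // stands in for ast::ModExpression
}

struct Parser {
    mode: Mode,
}

impl Parser {
    fn parse_single_expression(&mut self) -> String {
        "ModExpression".to_string()
    }

    fn parse_module(&mut self) -> String {
        "ModModule".to_string()
    }

    // The real parse_program also collects errors into a Program;
    // this sketch returns only the AST.
    fn parse_program(mut self) -> Mod {
        match self.mode {
            Mode::Expression => Mod::Expression(self.parse_single_expression()),
            Mode::Module | Mode::Ipython => Mod::Module(self.parse_module()),
        }
    }
}
```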

This PR doesn't go into much internal detail about the parser logic,
for two reasons:
1. It's unclear where such docs should live: as module docs in `lib.rs`,
or in the README?
2. The parser is still evolving and is likely to see significant
refactors as future work lands (a feedback loop, and improved error
recovery and resilience).

---------

Co-authored-by: Alex Waygood <Alex.Waygood@Gmail.com>


@@ -1,7 +1,7 @@
//! This crate can be used to parse Python source code into an Abstract
//! Syntax Tree.
//!
//! ## Overview:
//! ## Overview
//!
//! The process by which source code is parsed into an AST can be broken down
//! into two general stages: [lexical analysis] and [parsing].
@@ -15,7 +15,7 @@
//! Name("print"), LeftParen, String("Hello world"), RightParen
//! ```
//!
//! these tokens are then consumed by the `ruff_python_parser`, which matches them against a set of
//! These tokens are then consumed by the `ruff_python_parser`, which matches them against a set of
//! grammar rules to verify that the source code is syntactically valid and to construct
//! an AST that represents the source code.
//!
@@ -48,16 +48,16 @@
//! },
//!```
//!
//! Note: The Tokens/ASTs shown above are not the exact tokens/ASTs generated by the `ruff_python_parser`.
//! **Note:** The Tokens/ASTs shown above are not the exact tokens/ASTs generated by the `ruff_python_parser`.
//! Refer to the [playground](https://play.ruff.rs) for the correct representation.
//!
//! ## Source code layout:
//! ## Source code layout
//!
//! The functionality of this crate is split into several modules:
//!
//! - token: This module contains the definition of the tokens that are generated by the lexer.
//! - [lexer]: This module contains the lexer and is responsible for generating the tokens.
//! - `ruff_python_parser`: This module contains an interface to the `ruff_python_parser` and is responsible for generating the AST.
//! - Functions and strings have special parsing requirements that are handled in additional files.
//! - parser: This module contains an interface to the [Program] and is responsible for generating the AST.
//! - mode: This module contains the definition of the different modes that the `ruff_python_parser` can be in.
//!
//! # Examples
@@ -78,14 +78,15 @@
//! These tokens can be directly fed into the `ruff_python_parser` to generate an AST:
//!
//! ```
//! use ruff_python_parser::{Mode, parse_tokens, tokenize_all};
//! use ruff_python_parser::lexer::lex;
//! use ruff_python_parser::{Mode, parse_tokens};
//!
//! let python_source = r#"
//! def is_odd(i):
//! return bool(i & 1)
//! "#;
//! let tokens = tokenize_all(python_source, Mode::Module);
//! let ast = parse_tokens(tokens, python_source, Mode::Module);
//! let tokens = lex(python_source, Mode::Module);
//! let ast = parse_tokens(tokens.collect(), python_source, Mode::Module);
//!
//! assert!(ast.is_ok());
//! ```
@@ -138,14 +139,16 @@ pub mod typing;
/// For example, parsing a simple function definition and a call to that function:
///
/// ```
/// use ruff_python_parser as parser;
/// use ruff_python_parser::parse_program;
///
/// let source = r#"
/// def foo():
/// return 42
///
/// print(foo())
/// "#;
/// let program = parser::parse_program(source);
///
/// let program = parse_program(source);
/// assert!(program.is_ok());
/// ```
pub fn parse_program(source: &str) -> Result<ModModule, ParseError> {
@@ -156,6 +159,28 @@ pub fn parse_program(source: &str) -> Result<ModModule, ParseError> {
}
}
/// Parse a full Python program into a [`Suite`].
///
/// This function is similar to [`parse_program`] except that it returns the module body
/// instead of the module itself.
///
/// # Example
///
/// For example, parsing a simple function definition and a call to that function:
///
/// ```
/// use ruff_python_parser::parse_suite;
///
/// let source = r#"
/// def foo():
/// return 42
///
/// print(foo())
/// "#;
///
/// let body = parse_suite(source);
/// assert!(body.is_ok());
/// ```
pub fn parse_suite(source: &str) -> Result<Suite, ParseError> {
parse_program(source).map(|m| m.body)
}
@@ -169,12 +194,11 @@ pub fn parse_suite(source: &str) -> Result<Suite, ParseError> {
///
/// For example, parsing a single expression denoting the addition of two numbers:
///
/// ```
/// use ruff_python_parser as parser;
/// let expr = parser::parse_expression("1 + 2");
/// ```
/// use ruff_python_parser::parse_expression;
///
/// let expr = parse_expression("1 + 2");
/// assert!(expr.is_ok());
///
/// ```
pub fn parse_expression(source: &str) -> Result<Expr, ParseError> {
let lexer = lex(source, Mode::Expression).collect();
@@ -195,7 +219,7 @@ pub fn parse_expression(source: &str) -> Result<Expr, ParseError> {
/// somewhat silly, location:
///
/// ```
/// use ruff_python_parser::{parse_expression_starts_at};
/// use ruff_python_parser::parse_expression_starts_at;
/// # use ruff_text_size::TextSize;
///
/// let expr = parse_expression_starts_at("1 + 2", TextSize::from(400));
@@ -262,7 +286,7 @@ pub fn parse(source: &str, mode: Mode) -> Result<Mod, ParseError> {
/// Parse the given Python source code using the specified [`Mode`] and [`TextSize`].
///
/// This function allows to specify the location of the the source code, other than
/// This function allows to specify the location of the source code, other than
/// that, it behaves exactly like [`parse`].
///
/// # Example
@@ -298,10 +322,12 @@ pub fn parse_starts_at(source: &str, mode: Mode, offset: TextSize) -> Result<Mod
/// them using the [`lexer::lex`] function:
///
/// ```
/// use ruff_python_parser::{lexer::lex, Mode, parse_tokens};
/// use ruff_python_parser::lexer::lex;
/// use ruff_python_parser::{Mode, parse_tokens};
///
/// let source = "1 + 2";
/// let expr = parse_tokens(lex(source, Mode::Expression).collect(), source, Mode::Expression);
/// let tokens = lex(source, Mode::Expression);
/// let expr = parse_tokens(tokens.collect(), source, Mode::Expression);
/// assert!(expr.is_ok());
/// ```
pub fn parse_tokens(tokens: Vec<LexResult>, source: &str, mode: Mode) -> Result<Mod, ParseError> {
@@ -370,13 +396,16 @@ pub fn parse_program_tokens(
}
/// Control in the different modes by which a source file can be parsed.
///
/// The mode argument specifies in what way code must be parsed.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)]
pub enum Mode {
/// The code consists of a sequence of statements.
Module,
/// The code consists of a single expression.
Expression,
/// The code consists of a sequence of statements which can include the
/// escape commands that are part of IPython syntax.
///
@@ -408,6 +437,7 @@ impl std::str::FromStr for Mode {
}
}
/// A type that can be represented as [Mode].
pub trait AsMode {
fn as_mode(&self) -> Mode;
}
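
For context on `AsMode`: it lets callers derive a parse [`Mode`] from some other value, such as a file kind. A hypothetical sketch (the `SourceKind` enum is illustrative only, not part of this crate):

```rust
// Hypothetical illustration: a file-kind enum mapping itself to a parse Mode.
// SourceKind is invented for this sketch; only AsMode and Mode come from the diff.
enum SourceKind {
    Python,
    IpyNotebook,
}

impl AsMode for SourceKind {
    fn as_mode(&self) -> Mode {
        match self {
            SourceKind::Python => Mode::Module,
            SourceKind::IpyNotebook => Mode::Ipython,
        }
    }
}
```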


@@ -26,6 +26,9 @@ mod statement;
#[cfg(test)]
mod tests;
/// Represents the parsed source code.
///
/// This includes the AST and all of the errors encountered during parsing.
#[derive(Debug)]
pub struct Program {
ast: ast::Mod,
@@ -43,12 +46,12 @@ impl Program {
&self.parse_errors
}
/// Consumes the `Program` and returns the parsed AST.
/// Consumes the [`Program`] and returns the parsed AST.
pub fn into_ast(self) -> ast::Mod {
self.ast
}
/// Consumes the `Program` and returns a list of syntax errors found during parsing.
/// Consumes the [`Program`] and returns a list of syntax errors found during parsing.
pub fn into_errors(self) -> Vec<ParseError> {
self.parse_errors
}
@@ -58,11 +61,13 @@ impl Program {
self.parse_errors.is_empty()
}
/// Parse the given Python source code using the specified [`Mode`].
pub fn parse_str(source: &str, mode: Mode) -> Program {
let tokens = lex(source, mode);
Self::parse_tokens(source, tokens.collect(), mode)
}
/// Parse a vector of [`LexResult`]s using the specified [`Mode`].
pub fn parse_tokens(source: &str, tokens: Vec<LexResult>, mode: Mode) -> Program {
Parser::new(source, mode, TokenSource::new(tokens)).parse_program()
}
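
Putting the `Program` API from this file together, usage might look like the sketch below. Two hedges: the import path is an assumption, and the errors getter is assumed to be named `parse_errors` (the diff shows a getter returning `&self.parse_errors`, but its signature line falls outside the hunk):

```rust
// Sketch only: exercise the Program API shown in this diff.
// Import paths are assumptions; adjust to wherever Program is exported.
use ruff_python_parser::{parser::Program, Mode};

fn parse_and_inspect(source: &str) {
    // Lex + parse in one step, as parse_str does above.
    let program = Program::parse_str(source, Mode::Module);

    // Getter name assumed from the `&self.parse_errors` body shown above.
    if program.parse_errors().is_empty() {
        // Consume the Program and take ownership of the AST.
        let _ast = program.into_ast();
    } else {
        // Or consume it to own the syntax errors instead.
        for error in program.into_errors() {
            eprintln!("syntax error: {error}");
        }
    }
}
```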
@@ -124,49 +129,11 @@ impl<'src> Parser<'src> {
}
}
/// Consumes the [`Parser`] and returns the parsed [`Program`].
pub(crate) fn parse_program(mut self) -> Program {
let ast = if self.mode == Mode::Expression {
let start = self.node_start();
let parsed_expr = self.parse_expression_list(ExpressionContext::default());
// All of the remaining newlines are actually going to be non-logical newlines.
self.eat(TokenKind::Newline);
if !self.at(TokenKind::EndOfFile) {
self.add_error(
ParseErrorType::UnexpectedExpressionToken,
self.current_token_range(),
);
// TODO(dhruvmanila): How should error recovery work here? Just truncate after the expression?
let mut progress = ParserProgress::default();
loop {
progress.assert_progressing(&self);
if self.at(TokenKind::EndOfFile) {
break;
}
self.next_token();
}
}
self.bump(TokenKind::EndOfFile);
Mod::Expression(ast::ModExpression {
body: Box::new(parsed_expr.expr),
range: self.node_range(start),
})
} else {
let body = self.parse_list_into_vec(
RecoveryContextKind::ModuleStatements,
Parser::parse_statement,
);
self.bump(TokenKind::EndOfFile);
Mod::Module(ast::ModModule {
body,
range: self.tokens_range,
})
let ast = match self.mode {
Mode::Expression => Mod::Expression(self.parse_single_expression()),
Mode::Module | Mode::Ipython => Mod::Module(self.parse_module()),
};
Program {
@@ -175,6 +142,63 @@ impl<'src> Parser<'src> {
}
}
/// Parses a single expression.
///
/// This is to be used for [`Mode::Expression`].
///
/// ## Recovery
///
/// After parsing a single expression, an error is reported and all remaining tokens are
/// dropped by the parser.
fn parse_single_expression(&mut self) -> ast::ModExpression {
let start = self.node_start();
let parsed_expr = self.parse_expression_list(ExpressionContext::default());
// All remaining newlines are actually going to be non-logical newlines.
self.eat(TokenKind::Newline);
if !self.at(TokenKind::EndOfFile) {
self.add_error(
ParseErrorType::UnexpectedExpressionToken,
self.current_token_range(),
);
// TODO(dhruvmanila): How should error recovery work here? Just truncate after the expression?
let mut progress = ParserProgress::default();
loop {
progress.assert_progressing(self);
if self.at(TokenKind::EndOfFile) {
break;
}
self.next_token();
}
}
self.bump(TokenKind::EndOfFile);
ast::ModExpression {
body: Box::new(parsed_expr.expr),
range: self.node_range(start),
}
}
/// Parses a Python module.
///
/// This is to be used for [`Mode::Module`] and [`Mode::Ipython`].
fn parse_module(&mut self) -> ast::ModModule {
let body = self.parse_list_into_vec(
RecoveryContextKind::ModuleStatements,
Parser::parse_statement,
);
self.bump(TokenKind::EndOfFile);
ast::ModModule {
body,
range: self.tokens_range,
}
}
fn finish(self) -> Vec<ParseError> {
assert_eq!(
self.current_token_kind(),


@@ -3,7 +3,7 @@
//! This module defines the tokens that the lexer recognizes. The tokens are
//! loosely based on the token definitions found in the [CPython source].
//!
//! [CPython source]: https://github.com/python/cpython/blob/dfc2e065a2e71011017077e549cd2f9bf4944c54/Include/internal/pycore_token.h;
//! [CPython source]: https://github.com/python/cpython/blob/dfc2e065a2e71011017077e549cd2f9bf4944c54/Grammar/Tokens
use ruff_python_ast::{AnyStringKind, BoolOp, Int, IpyEscapeKind, Operator, UnaryOp};
use std::fmt;
@@ -352,6 +352,10 @@ impl fmt::Display for Tok {
}
}
/// A kind of token.
///
/// This is a lightweight representation of [`Tok`] which doesn't contain any information
/// about the token itself.
#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
pub enum TokenKind {
/// Token value for a name, commonly known as an identifier.


@@ -1,10 +1,13 @@
use crate::{parse_expression, parse_expression_starts_at};
//! This module takes care of parsing a type annotation.
use anyhow::Result;
use ruff_python_ast::relocate::relocate_expr;
use ruff_python_ast::str;
use ruff_python_ast::Expr;
use ruff_python_ast::{str, Expr};
use ruff_text_size::{TextLen, TextRange};
use crate::{parse_expression, parse_expression_starts_at};
#[derive(is_macro::Is, Copy, Clone, Debug)]
pub enum AnnotationKind {
/// The annotation is defined as part a simple string literal,