Mirror of https://github.com/RustPython/Parser.git, synced 2025-07-09 22:25:23 +00:00

Merge pull request #4492 from DimitrisJim/doc_parser_uno: Document parser crate.

Commit fb4bc89812: 6 changed files with 429 additions and 86 deletions.
context.rs

@@ -1,6 +1,6 @@
 use rustpython_ast::{Expr, ExprContext, ExprKind};
 
-pub fn set_context(expr: Expr, ctx: ExprContext) -> Expr {
+pub(crate) fn set_context(expr: Expr, ctx: ExprContext) -> Expr {
     match expr.node {
         ExprKind::Name { id, .. } => Expr {
             node: ExprKind::Name { id, ctx },
error.rs

@@ -1,40 +1,71 @@
-//! Define internal parse error types
-//! The goal is to provide a matching and a safe error API, maksing errors from LALR
+//! Error types for the parser.
+//!
+//! These types are used to represent errors that occur during lexing and parsing and are
+//! returned by the `parse_*` functions in the [parser] module and the iterator in the
+//! [lexer] implementation.
+//!
+//! [parser]: crate::parser
+//! [lexer]: crate::lexer
 
+// Define internal parse error types.
+// The goal is to provide a matching and a safe error API, masking errors from LALR
 use crate::{ast::Location, token::Tok};
 use lalrpop_util::ParseError as LalrpopError;
 use std::fmt;
 
-/// Represents an error during lexical scanning.
+/// Represents an error during lexing.
 #[derive(Debug, PartialEq)]
 pub struct LexicalError {
+    /// The type of error that occurred.
     pub error: LexicalErrorType,
+    /// The location of the error.
     pub location: Location,
 }
 
 impl LexicalError {
+    /// Creates a new `LexicalError` with the given error type and location.
     pub fn new(error: LexicalErrorType, location: Location) -> Self {
         Self { error, location }
     }
 }
 
+/// Represents the different types of errors that can occur during lexing.
 #[derive(Debug, PartialEq)]
 pub enum LexicalErrorType {
+    // TODO: Can probably be removed, the places it is used seem to be able
+    // to use the `UnicodeError` variant instead.
+    #[doc(hidden)]
     StringError,
+    // TODO: Should take a start/end position to report.
+    /// Decoding of a unicode escape sequence in a string literal failed.
     UnicodeError,
+    /// The nesting of brackets/braces/parentheses is not balanced.
     NestingError,
+    /// The indentation is not consistent.
     IndentationError,
+    /// Inconsistent use of tabs and spaces.
     TabError,
+    /// Encountered a tab after a space.
     TabsAfterSpaces,
+    /// A non-default argument follows a default argument.
     DefaultArgumentError,
+    /// A duplicate argument was found in a function definition.
     DuplicateArgumentError(String),
+    /// A positional argument follows a keyword argument.
     PositionalArgumentError,
+    /// An iterable argument unpacking `*args` follows keyword argument unpacking `**kwargs`.
     UnpackedArgumentError,
+    /// A keyword argument was repeated.
     DuplicateKeywordArgumentError(String),
+    /// An unrecognized token was encountered.
     UnrecognizedToken { tok: char },
+    /// An f-string error containing the [`FStringErrorType`].
     FStringError(FStringErrorType),
+    /// An unexpected character was encountered after a line continuation.
     LineContinuationError,
+    /// An unexpected end of file was encountered.
     Eof,
+    /// An unexpected error occurred.
     OtherError(String),
 }
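The documented fields and the `new` constructor can be exercised directly. A minimal sketch, assuming `LexicalError` and `LexicalErrorType` are publicly reachable at `rustpython_parser::error` and `Location` at `rustpython_parser::ast` (paths inferred from the module links added in this commit, not confirmed by the diff itself):

```rust
use rustpython_parser::ast::Location;
use rustpython_parser::error::{LexicalError, LexicalErrorType};

fn main() {
    // Build the error the way the lexer would on a stray tab after spaces.
    let err = LexicalError::new(LexicalErrorType::TabsAfterSpaces, Location::new(3, 7));

    // Callers can match on the error type to decide how to report it.
    match err.error {
        LexicalErrorType::TabsAfterSpaces => eprintln!("tab after space at {:?}", err.location),
        other => eprintln!("lexical error {other:?} at {:?}", err.location),
    }
}
```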
@@ -85,13 +116,17 @@ impl fmt::Display for LexicalErrorType {
 }
 
 // TODO: consolidate these with ParseError
+/// An error that occurred during parsing of an f-string.
 #[derive(Debug, PartialEq)]
 pub struct FStringError {
+    /// The type of error that occurred.
     pub error: FStringErrorType,
+    /// The location of the error.
     pub location: Location,
 }
 
 impl FStringError {
+    /// Creates a new `FStringError` with the given error type and location.
     pub fn new(error: FStringErrorType, location: Location) -> Self {
         Self { error, location }
     }
@@ -106,19 +141,33 @@ impl From<FStringError> for LexicalError {
     }
 }
 
+/// Represents the different types of errors that can occur during parsing of an f-string.
 #[derive(Debug, PartialEq)]
 pub enum FStringErrorType {
+    /// Expected a right brace after an opened left brace.
     UnclosedLbrace,
+    /// Expected a left brace after an ending right brace.
     UnopenedRbrace,
+    /// Expected a right brace after a conversion flag.
     ExpectedRbrace,
+    /// An error occurred while parsing an f-string expression.
     InvalidExpression(Box<ParseErrorType>),
+    /// An invalid conversion flag was encountered.
     InvalidConversionFlag,
+    /// An empty expression was encountered.
     EmptyExpression,
+    /// An opening delimiter was not closed properly.
     MismatchedDelimiter(char, char),
+    /// Too many nested expressions in an f-string.
     ExpressionNestedTooDeeply,
+    /// The f-string expression cannot include the given character.
     ExpressionCannotInclude(char),
+    /// A single right brace was encountered.
     SingleRbrace,
+    /// A closing delimiter was not opened properly.
     Unmatched(char),
+    // TODO: Test this case.
+    /// Unterminated string.
     UnterminatedString,
 }
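Since `FStringError` converts into `LexicalError` (per the `From` impls named in the hunk headers), f-string problems surface through the regular parse API. A hedged sketch, assuming the `ParseError`/`BaseError` shape documented later in this file:

```rust
use rustpython_parser::error::{LexicalErrorType, ParseErrorType};
use rustpython_parser::parser;

fn main() {
    // An f-string with an unclosed `{` fails to parse.
    let err = parser::parse_expression(r#"f"{42""#, "<test>").unwrap_err();

    // The failure is reported as a lexical error wrapping an FStringErrorType.
    assert!(matches!(
        err.error,
        ParseErrorType::Lexical(LexicalErrorType::FStringError(_))
    ));
}
```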
@@ -167,9 +216,10 @@ impl From<FStringError> for LalrpopError<Location, Tok, LexicalError> {
     }
 }
 
-/// Represents an error during parsing
+/// Represents an error during parsing.
 pub type ParseError = rustpython_compiler_core::BaseError<ParseErrorType>;
 
+/// Represents the different types of errors that can occur during parsing.
 #[derive(Debug, PartialEq, thiserror::Error)]
 pub enum ParseErrorType {
     /// Parser encountered an unexpected end of input
@@ -180,11 +230,12 @@ pub enum ParseErrorType {
     InvalidToken,
     /// Parser encountered an unexpected token
     UnrecognizedToken(Tok, Option<String>),
-    /// Maps to `User` type from `lalrpop-util`
+    // Maps to `User` type from `lalrpop-util`
+    /// Parser encountered an error during lexing.
     Lexical(LexicalErrorType),
 }
 
-/// Convert `lalrpop_util::ParseError` to our internal type
+// Convert `lalrpop_util::ParseError` to our internal type
 pub(crate) fn parse_error_from_lalrpop(
     err: LalrpopError<Location, Tok, LexicalError>,
     source_path: &str,
@@ -258,6 +309,7 @@ impl fmt::Display for ParseErrorType {
 }
 
 impl ParseErrorType {
+    /// Returns true if the error is an indentation error.
     pub fn is_indentation_error(&self) -> bool {
         match self {
             ParseErrorType::Lexical(LexicalErrorType::IndentationError) => true,
@@ -267,6 +319,8 @@ impl ParseErrorType {
             _ => false,
         }
     }
 
+    /// Returns true if the error is a tab error.
     pub fn is_tab_error(&self) -> bool {
         matches!(
             self,
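A short sketch of how a caller might use the new `is_indentation_error` helper, assuming `BaseError` exposes its payload as a public `error` field (not shown in this diff):

```rust
use rustpython_parser::parser::parse_program;

fn main() {
    // The second `pass` dedents to a level that was never indented to.
    let source = "if True:\n        pass\n    pass";

    match parse_program(source, "<test>") {
        Ok(_) => println!("parsed"),
        Err(e) if e.error.is_indentation_error() => println!("indentation error"),
        Err(e) => println!("other parse error: {e:?}"),
    }
}
```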
lib.rs

@@ -1,19 +1,119 @@
-//! This crate can be used to parse python sourcecode into a so
-//! called AST (abstract syntax tree).
-//!
-//! The stages involved in this process are lexical analysis and
-//! parsing. The lexical analysis splits the sourcecode into
-//! tokens, and the parsing transforms those tokens into an AST.
-//!
-//! For example, one could do this:
-//!
-//! ```
-//! use rustpython_parser::{parser, ast};
-//!
-//! let python_source = "print('Hello world')";
-//! let python_ast = parser::parse_expression(python_source, "<embedded>").unwrap();
-//! ```
+//! This crate can be used to parse Python source code into an Abstract
+//! Syntax Tree.
+//!
+//! ## Overview:
+//!
+//! The process by which source code is parsed into an AST can be broken down
+//! into two general stages: [lexical analysis] and [parsing].
+//!
+//! During lexical analysis, the source code is converted into a stream of lexical
+//! tokens that represent the smallest meaningful units of the language. For example,
+//! the source code `print("Hello world")` would _roughly_ be converted into the following
+//! stream of tokens:
+//!
+//! ```text
+//! Name("print"), LeftParen, String("Hello world"), RightParen
+//! ```
+//!
+//! These tokens are then consumed by the parser, which matches them against a set of
+//! grammar rules to verify that the source code is syntactically valid and to construct
+//! an AST that represents the source code.
+//!
+//! During parsing, the parser consumes the tokens generated by the lexer and constructs
+//! a tree representation of the source code. The tree is made up of nodes that represent
+//! the different syntactic constructs of the language. If the source code is syntactically
+//! invalid, parsing fails and an error is returned. After a successful parse, the AST can
+//! be used to perform further analysis on the source code. Continuing with the example
+//! above, the AST generated by the parser would _roughly_ look something like this:
+//!
+//! ```text
+//! node: Expr {
+//!     value: {
+//!         node: Call {
+//!             func: {
+//!                 node: Name {
+//!                     id: "print",
+//!                     ctx: Load,
+//!                 },
+//!             },
+//!             args: [
+//!                 node: Constant {
+//!                     value: Str("Hello World"),
+//!                     kind: None,
+//!                 },
+//!             ],
+//!             keywords: [],
+//!         },
+//!     },
+//! },
+//! ```
+//!
+//! Note: The Tokens/ASTs shown above are not the exact tokens/ASTs generated by the parser.
+//!
+//! ## Source code layout:
+//!
+//! The functionality of this crate is split into several modules:
+//!
+//! - [token]: This module contains the definition of the tokens that are generated by the lexer.
+//! - [lexer]: This module contains the lexer and is responsible for generating the tokens.
+//! - [parser]: This module contains an interface to the parser and is responsible for generating the AST.
+//!     - Functions and strings have special parsing requirements that are handled in additional files.
+//! - [mode]: This module contains the definition of the different modes that the parser can be in.
+//! - [error]: This module contains the definition of the errors that can be returned by the parser.
+//!
+//! # Examples
+//!
+//! For example, to get a stream of tokens from a given string, one could do this:
+//!
+//! ```
+//! use rustpython_parser::lexer::make_tokenizer;
+//!
+//! let python_source = r#"
+//! def is_odd(i):
+//!    return bool(i & 1)
+//! "#;
+//! let mut tokens = make_tokenizer(python_source);
+//! assert!(tokens.all(|t| t.is_ok()));
+//! ```
+//!
+//! These tokens can be directly fed into the parser to generate an AST:
+//!
+//! ```
+//! use rustpython_parser::parser::{parse_tokens, Mode};
+//! use rustpython_parser::lexer::make_tokenizer;
+//!
+//! let python_source = r#"
+//! def is_odd(i):
+//!    return bool(i & 1)
+//! "#;
+//! let tokens = make_tokenizer(python_source);
+//! let ast = parse_tokens(tokens, Mode::Module, "<embedded>");
+//!
+//! assert!(ast.is_ok());
+//! ```
+//!
+//! Alternatively, you can use one of the other `parse_*` functions to parse a string directly without using a specific
+//! mode or tokenizing the source beforehand:
+//!
+//! ```
+//! use rustpython_parser::parser::parse_program;
+//!
+//! let python_source = r#"
+//! def is_odd(i):
+//!    return bool(i & 1)
+//! "#;
+//! let ast = parse_program(python_source, "<embedded>");
+//!
+//! assert!(ast.is_ok());
+//! ```
+//!
+//! [lexical analysis]: https://en.wikipedia.org/wiki/Lexical_analysis
+//! [parsing]: https://en.wikipedia.org/wiki/Parsing
+//! [token]: crate::token
+//! [lexer]: crate::lexer
+//! [parser]: crate::parser
+//! [mode]: crate::mode
+//! [error]: crate::error
 
 #![doc(html_logo_url = "https://raw.githubusercontent.com/RustPython/RustPython/main/logo.png")]
 #![doc(html_root_url = "https://docs.rs/rustpython-parser/")]
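The two stages described in the new crate docs can also be driven separately from application code; a minimal sketch using only APIs that appear in this diff:

```rust
use rustpython_parser::lexer::make_tokenizer;
use rustpython_parser::parser::{parse_tokens, Mode};

fn main() {
    let source = "print('Hello world')";

    // Stage 1: lexical analysis. Each item is a result holding a spanned token.
    for token in make_tokenizer(source) {
        println!("{token:?}");
    }

    // Stage 2: parsing, which consumes a fresh token stream and builds the AST.
    let ast = parse_tokens(make_tokenizer(source), Mode::Module, "<embedded>");
    assert!(ast.is_ok());
}
```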
mode.rs

@@ -1,9 +1,14 @@
+//! Control over the different modes by which a source file can be parsed.
 use crate::token::Tok;
 
+/// The mode argument specifies in what way code must be parsed.
 #[derive(Clone, Copy)]
 pub enum Mode {
+    /// The code consists of a sequence of statements.
     Module,
+    /// The code consists of a sequence of interactive statements.
     Interactive,
+    /// The code consists of a single expression.
     Expression,
 }
@@ -39,6 +44,7 @@ impl std::str::FromStr for Mode {
     }
 }
 
+/// Returned when a given mode is not valid.
 #[derive(Debug)]
 pub struct ModeParseError {
     _priv: (),
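The three variants map directly onto what the parser will accept; a small sketch using the `parse` function documented later in this commit:

```rust
use rustpython_parser::parser::{parse, Mode};

fn main() {
    // An expression parses in both expression and module mode...
    assert!(parse("1 + 2", Mode::Expression, "<embedded>").is_ok());
    assert!(parse("1 + 2", Mode::Module, "<embedded>").is_ok());

    // ...but a statement is only valid where a sequence of statements is expected.
    assert!(parse("x = 1", Mode::Expression, "<embedded>").is_err());
    assert!(parse("x = 1", Mode::Module, "<embedded>").is_ok());
}
```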
parser.rs

@@ -1,9 +1,16 @@
-//! Python parsing.
+//! Contains the interface to the Python parser.
 //!
-//! Use this module to parse python code into an AST.
-//! There are three ways to parse python code. You could
-//! parse a whole program, a single statement, or a single
-//! expression.
+//! Functions in this module can be used to parse Python code into an [Abstract Syntax Tree]
+//! (AST) that is then transformed into bytecode.
+//!
+//! There are three ways to parse Python code corresponding to the different [`Mode`]s
+//! defined in the [`mode`] module.
+//!
+//! All functions return a [`Result`](std::result::Result) containing the parsed AST or
+//! a [`ParseError`] if parsing failed.
+//!
+//! [Abstract Syntax Tree]: https://en.wikipedia.org/wiki/Abstract_syntax_tree
+//! [`Mode`]: crate::mode
 
 use crate::lexer::{LexResult, Tok};
 pub use crate::mode::Mode;
@@ -12,13 +19,26 @@ use ast::Location;
 use itertools::Itertools;
 use std::iter;
 
-/*
- * Parse python code.
- * Grammar may be inspired by antlr grammar for python:
- * https://github.com/antlr/grammars-v4/tree/master/python3
- */
-
-/// Parse a full python program, containing usually multiple lines.
+/// Parse a full Python program usually consisting of multiple lines.
+///
+/// This is a convenience function that can be used to parse a full Python program without having to
+/// specify the [`Mode`] or the location. It is probably what you want to use most of the time.
+///
+/// # Example
+///
+/// For example, parsing a simple function definition and a call to that function:
+///
+/// ```
+/// use rustpython_parser::parser;
+/// let source = r#"
+/// def foo():
+///    return 42
+///
+/// print(foo())
+/// "#;
+/// let program = parser::parse_program(source, "<embedded>");
+/// assert!(program.is_ok());
+/// ```
 pub fn parse_program(source: &str, source_path: &str) -> Result<ast::Suite, ParseError> {
     parse(source, Mode::Module, source_path).map(|top| match top {
         ast::Mod::Module { body, .. } => body,
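Beyond checking `is_ok()`, the returned `ast::Suite` is the list of top-level statements, so it can be inspected directly. A sketch assuming `Suite` is a `Vec` of statements, as the `ast::Mod::Module { body, .. } => body` mapping above suggests:

```rust
use rustpython_parser::parser::parse_program;

fn main() {
    let source = "x = 1\ny = 2\nprint(x + y)";

    // Three top-level statements: two assignments and an expression statement.
    let program = parse_program(source, "<embedded>").unwrap();
    assert_eq!(program.len(), 3);
}
```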
@@ -26,49 +46,44 @@ pub fn parse_program(source: &str, source_path: &str) -> Result<ast::Suite, ParseError> {
     })
 }
 
-/// Parses a python expression
+/// Parses a single Python expression.
+///
+/// This convenience function can be used to parse a single expression without having to
+/// specify the Mode or the location.
 ///
 /// # Example
-/// ```
+///
+/// For example, parsing a single expression denoting the addition of two numbers:
+///
+/// ```
 /// extern crate num_bigint;
 /// use rustpython_parser::{parser, ast};
-/// let expr = parser::parse_expression("1 + 2", "<embedded>").unwrap();
+/// let expr = parser::parse_expression("1 + 2", "<embedded>");
 ///
-/// assert_eq!(
-///     expr,
-///     ast::Expr {
-///         location: ast::Location::new(1, 0),
-///         end_location: Some(ast::Location::new(1, 5)),
-///         custom: (),
-///         node: ast::ExprKind::BinOp {
-///             left: Box::new(ast::Expr {
-///                 location: ast::Location::new(1, 0),
-///                 end_location: Some(ast::Location::new(1, 1)),
-///                 custom: (),
-///                 node: ast::ExprKind::Constant {
-///                     value: ast::Constant::Int(1.into()),
-///                     kind: None,
-///                 }
-///             }),
-///             op: ast::Operator::Add,
-///             right: Box::new(ast::Expr {
-///                 location: ast::Location::new(1, 4),
-///                 end_location: Some(ast::Location::new(1, 5)),
-///                 custom: (),
-///                 node: ast::ExprKind::Constant {
-///                     value: ast::Constant::Int(2.into()),
-///                     kind: None,
-///                 }
-///             })
-///         }
-///     },
-/// );
+/// assert!(expr.is_ok());
 ///
 /// ```
 pub fn parse_expression(source: &str, path: &str) -> Result<ast::Expr, ParseError> {
     parse_expression_located(source, path, Location::new(1, 0))
 }
 
+/// Parses a Python expression from a given location.
+///
+/// This function allows specifying the location of the expression in the source code; other than
+/// that, it behaves exactly like [`parse_expression`].
+///
+/// # Example
+///
+/// Parsing a single expression denoting the addition of two numbers, but this time specifying a different,
+/// somewhat silly, location:
+///
+/// ```
+/// use rustpython_parser::parser::parse_expression_located;
+/// use rustpython_parser::ast::Location;
+///
+/// let expr = parse_expression_located("1 + 2", "<embedded>", Location::new(5, 20));
+/// assert!(expr.is_ok());
+/// ```
 pub fn parse_expression_located(
     source: &str,
     path: &str,
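The removed doctest above spelled out the entire `BinOp` tree; the same structure can still be inspected with a pattern match. A sketch using the `ast` names from that old doctest:

```rust
use rustpython_parser::{ast, parser};

fn main() {
    let expr = parser::parse_expression("1 + 2", "<embedded>").unwrap();

    // "1 + 2" parses to a binary operation with an `Add` operator.
    match expr.node {
        ast::ExprKind::BinOp { op: ast::Operator::Add, .. } => println!("it's an addition"),
        other => panic!("unexpected expression: {other:?}"),
    }
}
```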
@@ -80,12 +95,64 @@ pub fn parse_expression_located(
     })
 }
 
-// Parse a given source code
+/// Parse the given Python source code using the specified [`Mode`].
+///
+/// This function is the most general function to parse Python code. Based on the [`Mode`] supplied,
+/// it can be used to parse a single expression, a full Python program or an interactive expression.
+///
+/// # Example
+///
+/// If we want to parse a simple expression, we can use the [`Mode::Expression`] mode during
+/// parsing:
+///
+/// ```
+/// use rustpython_parser::parser::{parse, Mode};
+///
+/// let expr = parse("1 + 2", Mode::Expression, "<embedded>");
+/// assert!(expr.is_ok());
+/// ```
+///
+/// Alternatively, we can parse a full Python program consisting of multiple lines:
+///
+/// ```
+/// use rustpython_parser::parser::{parse, Mode};
+///
+/// let source = r#"
+/// class Greeter:
+///
+///   def greet(self):
+///     print("Hello, world!")
+/// "#;
+/// let program = parse(source, Mode::Module, "<embedded>");
+/// assert!(program.is_ok());
+/// ```
 pub fn parse(source: &str, mode: Mode, source_path: &str) -> Result<ast::Mod, ParseError> {
     parse_located(source, mode, source_path, Location::new(1, 0))
 }
 
-// Parse a given source code from a given location
+/// Parse the given Python source code using the specified [`Mode`] and [`Location`].
+///
+/// This function allows specifying the location of the source code; other than
+/// that, it behaves exactly like [`parse`].
+///
+/// # Example
+///
+/// ```
+/// use rustpython_parser::parser::{parse_located, Mode};
+/// use rustpython_parser::ast::Location;
+///
+/// let source = r#"
+/// def fib(i):
+///    a, b = 0, 1
+///    for _ in range(i):
+///       a, b = b, a + b
+///    return a
+///
+/// print(fib(42))
+/// "#;
+/// let program = parse_located(source, Mode::Module, "<embedded>", Location::new(1, 0));
+/// assert!(program.is_ok());
+/// ```
 pub fn parse_located(
     source: &str,
     mode: Mode,
@@ -96,7 +163,22 @@ pub fn parse_located(
     parse_tokens(lxr, mode, source_path)
 }
 
-// Parse a given token iterator.
+/// Parse an iterator of [`LexResult`]s using the specified [`Mode`].
+///
+/// This could allow you to perform some preprocessing on the tokens before parsing them.
+///
+/// # Example
+///
+/// As an example, instead of parsing a string, we can parse a list of tokens after we generate
+/// them using the [`lexer::make_tokenizer`] function:
+///
+/// ```
+/// use rustpython_parser::parser::{parse_tokens, Mode};
+/// use rustpython_parser::lexer::make_tokenizer;
+///
+/// let expr = parse_tokens(make_tokenizer("1 + 2"), Mode::Expression, "<embedded>");
+/// assert!(expr.is_ok());
+/// ```
 pub fn parse_tokens(
     lxr: impl IntoIterator<Item = LexResult>,
     mode: Mode,
@@ -328,4 +410,13 @@ with (0 as a, 1 as b,): pass
         let parse_ast = parse_expression(r#"{"a": "b", **c, "d": "e"}"#, "<test>").unwrap();
         insta::assert_debug_snapshot!(parse_ast);
     }
+
+    #[test]
+    fn test_modes() {
+        let source = "a[0][1][2][3][4]";
+
+        assert!(parse(&source, Mode::Expression, "<embedded>").is_ok());
+        assert!(parse(&source, Mode::Module, "<embedded>").is_ok());
+        assert!(parse(&source, Mode::Interactive, "<embedded>").is_ok());
+    }
 }
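The `parse_tokens` docs mention preprocessing the token stream; one concrete use is pulling comments out before parsing. A hedged sketch, assuming the lexer yields `Result`s of `(start, token, end)` triples and emits `Tok::Comment` (the comment handling described in the token changes below):

```rust
use rustpython_parser::lexer::{make_tokenizer, LexResult, Tok};
use rustpython_parser::parser::{parse_tokens, Mode};

fn main() {
    let source = "x = 1  # set x";

    // Record every comment while buffering the tokens for the parser.
    let mut comments = Vec::new();
    let tokens: Vec<LexResult> = make_tokenizer(source)
        .inspect(|t| {
            if let Ok((_, Tok::Comment(text), _)) = t {
                comments.push(text.clone());
            }
        })
        .collect();

    assert!(parse_tokens(tokens, Mode::Module, "<embedded>").is_ok());
    assert_eq!(comments.len(), 1);
}
```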
token.rs

@@ -1,86 +1,154 @@
-//! Different token definitions.
-//! Loosely based on token.h from CPython source:
+//! Token type for Python source code created by the lexer and consumed by the parser.
+//!
+//! This module defines the tokens that the lexer recognizes. The tokens are
+//! loosely based on the token definitions found in the [CPython source].
+//!
+//! [CPython source]: https://github.com/python/cpython/blob/dfc2e065a2e71011017077e549cd2f9bf4944c54/Include/internal/pycore_token.h
 use num_bigint::BigInt;
 use std::fmt;
 
-/// Python source code can be tokenized in a sequence of these tokens.
+/// The set of tokens the Python source code can be tokenized in.
 #[derive(Clone, Debug, PartialEq)]
 pub enum Tok {
+    /// Token value for a name, commonly known as an identifier.
     Name {
+        /// The name value.
         name: String,
     },
+    /// Token value for an integer.
     Int {
+        /// The integer value.
         value: BigInt,
     },
+    /// Token value for a floating point number.
     Float {
+        /// The float value.
         value: f64,
     },
+    /// Token value for a complex number.
     Complex {
+        /// The real part of the complex number.
         real: f64,
+        /// The imaginary part of the complex number.
         imag: f64,
     },
+    /// Token value for a string.
     String {
+        /// The string value.
         value: String,
+        /// The kind of string.
        kind: StringKind,
+        /// Whether the string is triple quoted.
        triple_quoted: bool,
     },
-    Newline,
-    NonLogicalNewline,
-    Indent,
-    Dedent,
-    StartModule,
-    StartInteractive,
-    StartExpression,
-    EndOfFile,
-    Lpar,
-    Rpar,
-    Lsqb,
-    Rsqb,
-    Colon,
-    Comma,
+    /// Token value for a comment. These are filtered out of the token stream prior to parsing.
     Comment(String),
+    /// Token value for a newline.
+    Newline,
+    /// Token value for a newline that is not a logical line break. These are filtered out of
+    /// the token stream prior to parsing.
+    NonLogicalNewline,
+    /// Token value for an indent.
+    Indent,
+    /// Token value for a dedent.
+    Dedent,
+    EndOfFile,
+    /// Token value for a left parenthesis `(`.
+    Lpar,
+    /// Token value for a right parenthesis `)`.
+    Rpar,
+    /// Token value for a left square bracket `[`.
+    Lsqb,
+    /// Token value for a right square bracket `]`.
+    Rsqb,
+    /// Token value for a colon `:`.
+    Colon,
+    /// Token value for a comma `,`.
+    Comma,
+    /// Token value for a semicolon `;`.
     Semi,
+    /// Token value for plus `+`.
     Plus,
+    /// Token value for minus `-`.
     Minus,
+    /// Token value for star `*`.
     Star,
+    /// Token value for slash `/`.
     Slash,
-    Vbar,  // '|'
-    Amper, // '&'
+    /// Token value for vertical bar `|`.
+    Vbar,
+    /// Token value for ampersand `&`.
+    Amper,
+    /// Token value for less than `<`.
     Less,
+    /// Token value for greater than `>`.
     Greater,
+    /// Token value for equal `=`.
     Equal,
+    /// Token value for dot `.`.
     Dot,
+    /// Token value for percent `%`.
     Percent,
+    /// Token value for left brace `{`.
     Lbrace,
+    /// Token value for right brace `}`.
     Rbrace,
+    /// Token value for double equal `==`.
     EqEqual,
+    /// Token value for not equal `!=`.
     NotEqual,
+    /// Token value for less than or equal `<=`.
     LessEqual,
+    /// Token value for greater than or equal `>=`.
     GreaterEqual,
+    /// Token value for tilde `~`.
     Tilde,
+    /// Token value for caret `^`.
     CircumFlex,
+    /// Token value for left shift `<<`.
     LeftShift,
+    /// Token value for right shift `>>`.
     RightShift,
+    /// Token value for double star `**`.
     DoubleStar,
-    DoubleStarEqual, // '**='
+    /// Token value for double star equal `**=`.
+    DoubleStarEqual,
+    /// Token value for plus equal `+=`.
     PlusEqual,
+    /// Token value for minus equal `-=`.
     MinusEqual,
+    /// Token value for star equal `*=`.
     StarEqual,
+    /// Token value for slash equal `/=`.
     SlashEqual,
+    /// Token value for percent equal `%=`.
     PercentEqual,
-    AmperEqual, // '&='
+    /// Token value for ampersand equal `&=`.
+    AmperEqual,
+    /// Token value for vertical bar equal `|=`.
     VbarEqual,
-    CircumflexEqual, // '^='
+    /// Token value for caret equal `^=`.
+    CircumflexEqual,
+    /// Token value for left shift equal `<<=`.
     LeftShiftEqual,
+    /// Token value for right shift equal `>>=`.
     RightShiftEqual,
-    DoubleSlash, // '//'
+    /// Token value for double slash `//`.
+    DoubleSlash,
+    /// Token value for double slash equal `//=`.
     DoubleSlashEqual,
+    /// Token value for colon equal `:=`.
     ColonEqual,
+    /// Token value for at `@`.
     At,
+    /// Token value for at equal `@=`.
     AtEqual,
+    /// Token value for arrow `->`.
     Rarrow,
+    /// Token value for ellipsis `...`.
     Ellipsis,
 
+    // Self documenting.
     // Keywords (alphabetically):
     False,
     None,
@@ -118,6 +186,11 @@ pub enum Tok {
     While,
     With,
     Yield,
+
+    // RustPython specific.
+    StartModule,
+    StartInteractive,
+    StartExpression,
 }
 
 impl fmt::Display for Tok {
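A quick sketch of consuming these tokens, under the same assumption as above that the lexer yields `Result`s of `(start, token, end)` triples:

```rust
use rustpython_parser::lexer::{make_tokenizer, Tok};

fn main() {
    // Collect every identifier in the source, ignoring all other tokens.
    let names: Vec<String> = make_tokenizer("foo = bar + 1")
        .filter_map(|t| match t {
            Ok((_, Tok::Name { name }, _)) => Some(name),
            _ => None,
        })
        .collect();

    assert_eq!(names, vec!["foo", "bar"]);
}
```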
@@ -231,14 +304,25 @@ impl fmt::Display for Tok {
     }
 }
 
+/// The kind of string literal as described in the [String and Bytes literals]
+/// section of the Python reference.
+///
+/// [String and Bytes literals]: https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
 #[derive(PartialEq, Eq, Debug, Clone)]
 pub enum StringKind {
+    /// A normal string literal with no prefix.
     String,
+    /// An f-string literal, with an `f` or `F` prefix.
     FString,
+    /// A byte string literal, with a `b` or `B` prefix.
     Bytes,
+    /// A raw string literal, with an `r` or `R` prefix.
     RawString,
+    /// A raw f-string literal, with a `rf`/`fr` or `rF`/`Fr` or `Rf`/`fR` or `RF`/`FR` prefix.
     RawFString,
+    /// A raw byte string literal, with a `rb`/`br` or `rB`/`Br` or `Rb`/`bR` or `RB`/`BR` prefix.
     RawBytes,
+    /// A unicode string literal, with a `u` or `U` prefix.
     Unicode,
 }
@@ -286,25 +370,33 @@ impl fmt::Display for StringKind {
 }
 
 impl StringKind {
+    /// Returns true if the string is a raw string, i.e. one of
+    /// [`StringKind::RawString`] or [`StringKind::RawFString`] or [`StringKind::RawBytes`].
     pub fn is_raw(&self) -> bool {
         use StringKind::{RawBytes, RawFString, RawString};
         matches!(self, RawString | RawFString | RawBytes)
     }
 
+    /// Returns true if the string is an f-string, i.e. one of
+    /// [`StringKind::FString`] or [`StringKind::RawFString`].
     pub fn is_fstring(&self) -> bool {
         use StringKind::{FString, RawFString};
         matches!(self, FString | RawFString)
     }
 
+    /// Returns true if the string is a byte string, i.e. one of
+    /// [`StringKind::Bytes`] or [`StringKind::RawBytes`].
     pub fn is_bytes(&self) -> bool {
         use StringKind::{Bytes, RawBytes};
         matches!(self, Bytes | RawBytes)
     }
 
+    /// Returns true if the string is a unicode string, i.e. [`StringKind::Unicode`].
     pub fn is_unicode(&self) -> bool {
         matches!(self, StringKind::Unicode)
     }
 
+    /// Returns the number of characters in the prefix.
     pub fn prefix_len(&self) -> usize {
         use StringKind::*;
         match self {
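The new `StringKind` docs and helpers can be seen in action on a lexed string token; a sketch under the same spanned-token assumption as above:

```rust
use rustpython_parser::lexer::{make_tokenizer, Tok};

fn main() {
    // A raw byte string: the prefix `rb` is two characters long.
    for t in make_tokenizer(r#"rb"raw bytes""#) {
        if let Ok((_, Tok::String { kind, .. }, _)) = t {
            assert!(kind.is_raw());
            assert!(kind.is_bytes());
            assert!(!kind.is_fstring());
            assert_eq!(kind.prefix_len(), 2);
        }
    }
}
```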