Mirror of https://github.com/RustPython/Parser.git, synced 2025-07-26 06:24:29 +00:00

Merge pull request #4543 from youknowone/flatten-parser

Flatten parser interface

Commit a19f294b0c, 12 changed files with 521 additions and 542 deletions
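In practice, the flattening means call sites no longer reach through the `parser` and `mode` submodules. A minimal before/after sketch, assuming the crate-root re-exports that the updated doc examples in this diff rely on:

    // Before: items lived in nested submodules.
    // use rustpython_parser::parser::parse;
    // use rustpython_parser::mode::Mode;

    // After: the same items are importable from the crate root.
    use rustpython_parser::{parse, Mode};

    fn main() {
        // Parse a small program in Module mode; the path is only used in error messages.
        let ast = parse("x = 1", Mode::Module, "<embedded>");
        assert!(ast.is_ok());
    }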
@@ -12,13 +12,18 @@
 //! [Abstract Syntax Tree]: https://en.wikipedia.org/wiki/Abstract_syntax_tree
 //! [`Mode`]: crate::mode
 
-use crate::lexer::{LexResult, Tok};
-pub use crate::mode::Mode;
-use crate::{ast, error::ParseError, lexer, python};
-use ast::Location;
+use crate::{
+    ast::{self, Location},
+    lexer::{self, LexResult, LexicalError, LexicalErrorType},
+    mode::Mode,
+    python,
+    token::Tok,
+};
 use itertools::Itertools;
 use std::iter;
 
+pub(super) use lalrpop_util::ParseError as LalrpopError;
+
 /// Parse a full Python program usually consisting of multiple lines.
 ///
 /// This is a convenience function that can be used to parse a full Python program without having to
@@ -29,7 +34,7 @@ use std::iter;
 /// For example, parsing a simple function definition and a call to that function:
 ///
 /// ```
-/// use rustpython_parser::parser;
+/// use rustpython_parser as parser;
 /// let source = r#"
 /// def foo():
 ///    return 42
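The example in this hunk is cut off by the diff context. A complete, self-contained version of the same flow (names taken from this diff; the `<test>` path is illustrative) might read:

    use rustpython_parser as parser;

    fn main() {
        let source = "def foo():\n    return 42\nfoo()";
        // parse_program parses a full module and yields an ast::Suite
        // (a list of statements) on success.
        let program = parser::parse_program(source, "<test>");
        assert!(program.is_ok());
    }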
@@ -57,7 +62,7 @@ pub fn parse_program(source: &str, source_path: &str) -> Result<ast::Suite, ParseError> {
 ///
 /// ```
-/// extern crate num_bigint;
-/// use rustpython_parser::{parser, ast};
+/// use rustpython_parser as parser;
 /// let expr = parser::parse_expression("1 + 2", "<embedded>");
 ///
 /// assert!(expr.is_ok());
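Beyond checking `is_ok()`, the returned value is a full `ast::Expr` that callers can match on. A sketch, assuming `parse_expression` is re-exported at the crate root like its `_located` sibling below, and that this AST generation exposes its kind through the `.node` field of a `Located` wrapper:

    use rustpython_parser::{ast, parse_expression};

    fn main() {
        let expr = parse_expression("1 + 2", "<embedded>").unwrap();
        // "1 + 2" should parse to a binary-operation node.
        assert!(matches!(expr.node, ast::ExprKind::BinOp { .. }));
    }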
@@ -78,8 +83,7 @@ pub fn parse_expression(source: &str, path: &str) -> Result<ast::Expr, ParseError> {
 /// somewhat silly, location:
 ///
 /// ```
-/// use rustpython_parser::parser::parse_expression_located;
-/// use rustpython_parser::ast::Location;
+/// use rustpython_parser::{ast::Location, parse_expression_located};
 ///
 /// let expr = parse_expression_located("1 + 2", "<embedded>", Location::new(5, 20));
 /// assert!(expr.is_ok());
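The point of the explicit start location is that nodes and errors for an embedded snippet are reported relative to the host document rather than to line 1. A sketch of that behaviour, assuming `Location` exposes a `row()` accessor as elsewhere in this crate:

    use rustpython_parser::{ast::Location, parse_expression_located};

    fn main() {
        // An incomplete expression embedded at row 5 should report
        // its error at row 5, not row 1.
        let err = parse_expression_located("1 +", "<embedded>", Location::new(5, 20)).unwrap_err();
        assert_eq!(err.location.row(), 5);
    }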
@@ -106,8 +110,7 @@ pub fn parse_expression_located(
 /// parsing:
 ///
 /// ```
-/// use rustpython_parser::mode::Mode;
-/// use rustpython_parser::parser::parse;
+/// use rustpython_parser::{Mode, parse};
 ///
 /// let expr = parse("1 + 2", Mode::Expression, "<embedded>");
 /// assert!(expr.is_ok());
@@ -116,8 +119,7 @@ pub fn parse_expression_located(
 /// Alternatively, we can parse a full Python program consisting of multiple lines:
 ///
 /// ```
-/// use rustpython_parser::mode::Mode;
-/// use rustpython_parser::parser::parse;
+/// use rustpython_parser::{Mode, parse};
 ///
 /// let source = r#"
 /// class Greeter:
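The `Mode` argument selects the grammar entry point, and the variant of the returned `ast::Mod` mirrors it. A sketch, assuming the usual `Mod::Module`/`Mod::Expression` variants of this AST generation:

    use rustpython_parser::{ast, parse, Mode};

    fn main() {
        // Module mode yields a Mod::Module wrapping a statement list.
        let module = parse("x = 1", Mode::Module, "<embedded>").unwrap();
        assert!(matches!(module, ast::Mod::Module { .. }));

        // Expression mode yields a Mod::Expression wrapping a single expression.
        let expr = parse("1 + 2", Mode::Expression, "<embedded>").unwrap();
        assert!(matches!(expr, ast::Mod::Expression { .. }));
    }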
@@ -140,9 +142,7 @@ pub fn parse(source: &str, mode: Mode, source_path: &str) -> Result<ast::Mod, ParseError> {
 /// # Example
 ///
 /// ```
-/// use rustpython_parser::ast::Location;
-/// use rustpython_parser::mode::Mode;
-/// use rustpython_parser::parser::parse_located;
+/// use rustpython_parser::{ast::Location, Mode, parse_located};
 ///
 /// let source = r#"
 /// def fib(i):
@@ -162,7 +162,7 @@ pub fn parse_located(
     source_path: &str,
     location: Location,
 ) -> Result<ast::Mod, ParseError> {
-    let lxr = lexer::make_tokenizer_located(source, mode, location);
+    let lxr = lexer::lex_located(source, mode, location);
     parse_tokens(lxr, mode, source_path)
 }
 
@@ -173,14 +173,12 @@ pub fn parse_located(
 /// # Example
 ///
 /// As an example, instead of parsing a string, we can parse a list of tokens after we generate
-/// them using the [`lexer::make_tokenizer`] function:
+/// them using the [`lexer::lex`] function:
 ///
 /// ```
-/// use rustpython_parser::lexer::make_tokenizer;
-/// use rustpython_parser::mode::Mode;
-/// use rustpython_parser::parser::parse_tokens;
+/// use rustpython_parser::{lexer::lex, Mode, parse_tokens};
 ///
-/// let expr = parse_tokens(make_tokenizer("1 + 2", Mode::Expression), Mode::Expression, "<embedded>");
+/// let expr = parse_tokens(lex("1 + 2", Mode::Expression), Mode::Expression, "<embedded>");
 /// assert!(expr.is_ok());
 /// ```
 pub fn parse_tokens(
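Splitting lexing from parsing is useful when a caller wants to observe or transform the token stream first. A minimal sketch using only the names from this hunk:

    use rustpython_parser::{lexer::lex, Mode, parse_tokens};

    fn main() {
        // lex() yields an iterator of LexResult items, which is
        // exactly what parse_tokens consumes.
        let tokens = lex("1 + 2", Mode::Expression);
        let ast = parse_tokens(tokens, Mode::Expression, "<embedded>");
        assert!(ast.is_ok());
    }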
@@ -189,12 +187,127 @@ pub fn parse_tokens(
     source_path: &str,
 ) -> Result<ast::Mod, ParseError> {
     let marker_token = (Default::default(), mode.to_marker(), Default::default());
-    let tokenizer = iter::once(Ok(marker_token))
+    let lexer = iter::once(Ok(marker_token))
         .chain(lxr)
         .filter_ok(|(_, tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline));
     python::TopParser::new()
-        .parse(tokenizer.into_iter())
-        .map_err(|e| crate::error::parse_error_from_lalrpop(e, source_path))
+        .parse(lexer.into_iter())
+        .map_err(|e| parse_error_from_lalrpop(e, source_path))
 }
 
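Two details of this body are worth noting: the synthetic marker token prepended via `iter::once` is how a single LALRPOP grammar dispatches between module, interactive, and expression parses, and the `filter_ok` call drops trivia before the grammar ever sees it. A sketch of the observable effect, assuming the lexer emits `Tok::Comment` and `Tok::NonLogicalNewline` inside parentheses:

    use rustpython_parser::{parse, Mode};

    fn main() {
        // The comment and the line break inside the parentheses are lexed as
        // Comment and NonLogicalNewline tokens, then filtered out above,
        // so the parser effectively sees only "1 + 2".
        let result = parse("(1 +  # a comment\n 2)", Mode::Expression, "<embedded>");
        assert!(result.is_ok());
    }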
+/// Represents errors that occur during parsing and are
+/// returned by the `parse_*` functions.
+pub type ParseError = rustpython_compiler_core::BaseError<ParseErrorType>;
+
+/// Represents the different types of errors that can occur during parsing.
+#[derive(Debug, PartialEq, thiserror::Error)]
+pub enum ParseErrorType {
+    /// Parser encountered an unexpected end of input
+    Eof,
+    /// Parser encountered an extra token
+    ExtraToken(Tok),
+    /// Parser encountered an invalid token
+    InvalidToken,
+    /// Parser encountered an unexpected token
+    UnrecognizedToken(Tok, Option<String>),
+    // Maps to `User` type from `lalrpop-util`
+    /// Parser encountered an error during lexing.
+    Lexical(LexicalErrorType),
+}
+
+// Convert `lalrpop_util::ParseError` to our internal type
+fn parse_error_from_lalrpop(
+    err: LalrpopError<Location, Tok, LexicalError>,
+    source_path: &str,
+) -> ParseError {
+    let source_path = source_path.to_owned();
+    match err {
+        // TODO: Are there cases where this isn't an EOF?
+        LalrpopError::InvalidToken { location } => ParseError {
+            error: ParseErrorType::Eof,
+            location,
+            source_path,
+        },
+        LalrpopError::ExtraToken { token } => ParseError {
+            error: ParseErrorType::ExtraToken(token.1),
+            location: token.0,
+            source_path,
+        },
+        LalrpopError::User { error } => ParseError {
+            error: ParseErrorType::Lexical(error.error),
+            location: error.location,
+            source_path,
+        },
+        LalrpopError::UnrecognizedToken { token, expected } => {
+            // Hacky, but it's how CPython does it. See PyParser_AddToken,
+            // in particular "Only one possible expected token" comment.
+            let expected = (expected.len() == 1).then(|| expected[0].clone());
+            ParseError {
+                error: ParseErrorType::UnrecognizedToken(token.1, expected),
+                location: token.0.with_col_offset(1),
+                source_path,
+            }
+        }
+        LalrpopError::UnrecognizedEOF { location, expected } => {
+            // This could be an initial indentation error that we should ignore
+            let indent_error = expected == ["Indent"];
+            if indent_error {
+                ParseError {
+                    error: ParseErrorType::Lexical(LexicalErrorType::IndentationError),
+                    location,
+                    source_path,
+                }
+            } else {
+                ParseError {
+                    error: ParseErrorType::Eof,
+                    location,
+                    source_path,
+                }
+            }
+        }
+    }
+}
+
+impl std::fmt::Display for ParseErrorType {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match *self {
+            ParseErrorType::Eof => write!(f, "Got unexpected EOF"),
+            ParseErrorType::ExtraToken(ref tok) => write!(f, "Got extraneous token: {tok:?}"),
+            ParseErrorType::InvalidToken => write!(f, "Got invalid token"),
+            ParseErrorType::UnrecognizedToken(ref tok, ref expected) => {
+                if *tok == Tok::Indent {
+                    write!(f, "unexpected indent")
+                } else if expected.as_deref() == Some("Indent") {
+                    write!(f, "expected an indented block")
+                } else {
+                    write!(f, "invalid syntax. Got unexpected token {tok}")
+                }
+            }
+            ParseErrorType::Lexical(ref error) => write!(f, "{error}"),
+        }
+    }
+}
+
+impl ParseErrorType {
+    /// Returns true if the error is an indentation error.
+    pub fn is_indentation_error(&self) -> bool {
+        match self {
+            ParseErrorType::Lexical(LexicalErrorType::IndentationError) => true,
+            ParseErrorType::UnrecognizedToken(token, expected) => {
+                *token == Tok::Indent || expected.clone() == Some("Indent".to_owned())
+            }
+            _ => false,
+        }
+    }
+
+    /// Returns true if the error is a tab error.
+    pub fn is_tab_error(&self) -> bool {
+        matches!(
+            self,
+            ParseErrorType::Lexical(LexicalErrorType::TabError)
+                | ParseErrorType::Lexical(LexicalErrorType::TabsAfterSpaces)
+        )
+    }
+}
+
 #[cfg(test)]
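Taken together, the error plumbing added above gives callers a typed `ParseError` instead of a lalrpop-specific one. A usage sketch, assuming the error types are re-exported at the crate root alongside the `parse_*` functions, and that a missing block after `if True:` surfaces with `"Indent"` as the sole expected token, as the CPython-style comment above suggests:

    use rustpython_parser::parse_program;

    fn main() {
        // An `if` header with no indented body should fail to parse...
        let err = parse_program("if True:\npass\n", "<test>").unwrap_err();
        // ...and be classified as an indentation error with a CPython-like message.
        assert!(err.error.is_indentation_error());
        println!("{}", err.error); // e.g. "expected an indented block"
    }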