Merge pull request #4543 from youknowone/flatten-parser

Flatten parser interface
Jeong YunWon 2023-02-23 02:09:46 +09:00 committed by GitHub
commit a19f294b0c
12 changed files with 521 additions and 542 deletions

@@ -12,13 +12,18 @@
 //! [Abstract Syntax Tree]: https://en.wikipedia.org/wiki/Abstract_syntax_tree
 //! [`Mode`]: crate::mode
 
-use crate::lexer::{LexResult, Tok};
-pub use crate::mode::Mode;
-use crate::{ast, error::ParseError, lexer, python};
-use ast::Location;
+use crate::{
+    ast::{self, Location},
+    lexer::{self, LexResult, LexicalError, LexicalErrorType},
+    mode::Mode,
+    python,
+    token::Tok,
+};
 use itertools::Itertools;
 use std::iter;
 
+pub(super) use lalrpop_util::ParseError as LalrpopError;
+
 /// Parse a full Python program usually consisting of multiple lines.
 ///
 /// This is a convenience function that can be used to parse a full Python program without having to
@@ -29,7 +34,7 @@ use std::iter;
 /// For example, parsing a simple function definition and a call to that function:
 ///
 /// ```
-/// use rustpython_parser::parser;
+/// use rustpython_parser as parser;
 /// let source = r#"
 /// def foo():
 ///    return 42
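
With the flattened interface the program entry point is reachable from the crate root, exactly as the updated doctest imports suggest. A minimal usage sketch (assuming `rustpython-parser` at this commit, where `ast::Suite` is a `Vec` of top-level statements):

```
use rustpython_parser as parser;

fn main() {
    // Two top-level statements: an assignment and a call.
    let program = parser::parse_program("x = 1\nprint(x)\n", "<embedded>")
        .expect("source should parse");
    assert_eq!(program.len(), 2);
}
```
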
@@ -57,7 +62,7 @@ pub fn parse_program(source: &str, source_path: &str) -> Result<ast::Suite, Pars
 ///
 /// ```
 /// extern crate num_bigint;
-/// use rustpython_parser::{parser, ast};
+/// use rustpython_parser as parser;
 /// let expr = parser::parse_expression("1 + 2", "<embedded>");
 ///
 /// assert!(expr.is_ok());
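
`parse_expression` accepts exactly one well-formed expression and reports anything else through `ParseError`. A small sketch of the Ok/Err split, under the same crate assumption:

```
use rustpython_parser as parser;

fn main() {
    // A complete expression parses...
    assert!(parser::parse_expression("1 + 2", "<embedded>").is_ok());
    // ...while a dangling operator comes back as Err(ParseError).
    assert!(parser::parse_expression("1 +", "<embedded>").is_err());
}
```
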
@@ -78,8 +83,7 @@ pub fn parse_expression(source: &str, path: &str) -> Result<ast::Expr, ParseErro
 /// somewhat silly, location:
 ///
 /// ```
-/// use rustpython_parser::parser::parse_expression_located;
-/// use rustpython_parser::ast::Location;
+/// use rustpython_parser::{ast::Location, parse_expression_located};
 ///
 /// let expr = parse_expression_located("1 + 2", "<embedded>", Location::new(5, 20));
 /// assert!(expr.is_ok());
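
Since every AST node carries a `Location`, handing `parse_expression_located` a custom start location shifts all reported positions. A sketch, assuming the located AST of this era (nodes expose a public `location` field and `Location` has a `row()` accessor):

```
use rustpython_parser::{ast::Location, parse_expression_located};

fn main() {
    let expr = parse_expression_located("1 + 2", "<embedded>", Location::new(5, 20))
        .expect("source should parse");
    // The top-level node is positioned relative to the given start location.
    assert_eq!(expr.location.row(), 5);
}
```
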
@@ -106,8 +110,7 @@ pub fn parse_expression_located(
 /// parsing:
 ///
 /// ```
-/// use rustpython_parser::mode::Mode;
-/// use rustpython_parser::parser::parse;
+/// use rustpython_parser::{Mode, parse};
 ///
 /// let expr = parse("1 + 2", Mode::Expression, "<embedded>");
 /// assert!(expr.is_ok());
@@ -116,8 +119,7 @@ pub fn parse_expression_located(
 /// Alternatively, we can parse a full Python program consisting of multiple lines:
 ///
 /// ```
-/// use rustpython_parser::mode::Mode;
-/// use rustpython_parser::parser::parse;
+/// use rustpython_parser::{Mode, parse};
 ///
 /// let source = r#"
 /// class Greeter:
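
The `Mode` passed to `parse` decides which `ast::Mod` variant comes back, so callers typically match on the result. A sketch (the variant field names here follow the `ast` module of this era and are an assumption):

```
use rustpython_parser::{ast, parse, Mode};

fn main() {
    match parse("1 + 2", Mode::Expression, "<embedded>").expect("source should parse") {
        ast::Mod::Expression { body } => println!("expression: {body:?}"),
        ast::Mod::Module { body, .. } => println!("module with {} statements", body.len()),
        other => println!("other result: {other:?}"),
    }
}
```
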
@@ -140,9 +142,7 @@ pub fn parse(source: &str, mode: Mode, source_path: &str) -> Result<ast::Mod, Pa
 /// # Example
 ///
 /// ```
-/// use rustpython_parser::ast::Location;
-/// use rustpython_parser::mode::Mode;
-/// use rustpython_parser::parser::parse_located;
+/// use rustpython_parser::{ast::Location, Mode, parse_located};
 ///
 /// let source = r#"
 /// def fib(i):
@@ -162,7 +162,7 @@ pub fn parse_located(
     source_path: &str,
     location: Location,
 ) -> Result<ast::Mod, ParseError> {
-    let lxr = lexer::make_tokenizer_located(source, mode, location);
+    let lxr = lexer::lex_located(source, mode, location);
     parse_tokens(lxr, mode, source_path)
 }
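
`lexer::make_tokenizer_located` is renamed to `lexer::lex_located` with an unchanged signature, so the lexer can still be driven directly when the parser is not needed. A sketch (assuming the iterator yields `LexResult` items as in the `lexer` module):

```
use rustpython_parser::{ast::Location, lexer, Mode};

fn main() {
    // Tokenize starting at an arbitrary location, as parse_located does internally.
    let tokens: Vec<_> =
        lexer::lex_located("x = 1", Mode::Module, Location::new(7, 0)).collect();
    assert!(tokens.iter().all(|token| token.is_ok()));
}
```
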
@@ -173,14 +173,12 @@ pub fn parse_located(
 /// # Example
 ///
 /// As an example, instead of parsing a string, we can parse a list of tokens after we generate
-/// them using the [`lexer::make_tokenizer`] function:
+/// them using the [`lexer::lex`] function:
 ///
 /// ```
-/// use rustpython_parser::lexer::make_tokenizer;
-/// use rustpython_parser::mode::Mode;
-/// use rustpython_parser::parser::parse_tokens;
+/// use rustpython_parser::{lexer::lex, Mode, parse_tokens};
 ///
-/// let expr = parse_tokens(make_tokenizer("1 + 2", Mode::Expression), Mode::Expression, "<embedded>");
+/// let expr = parse_tokens(lex("1 + 2", Mode::Expression), Mode::Expression, "<embedded>");
 /// assert!(expr.is_ok());
 /// ```
 pub fn parse_tokens(
@@ -189,12 +187,127 @@ pub fn parse_tokens(
     source_path: &str,
 ) -> Result<ast::Mod, ParseError> {
     let marker_token = (Default::default(), mode.to_marker(), Default::default());
-    let tokenizer = iter::once(Ok(marker_token))
+    let lexer = iter::once(Ok(marker_token))
         .chain(lxr)
         .filter_ok(|(_, tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline));
     python::TopParser::new()
-        .parse(tokenizer.into_iter())
-        .map_err(|e| crate::error::parse_error_from_lalrpop(e, source_path))
+        .parse(lexer.into_iter())
+        .map_err(|e| parse_error_from_lalrpop(e, source_path))
 }
 
+/// Represents the errors that occur during parsing and are
+/// returned by the `parse_*` functions.
+pub type ParseError = rustpython_compiler_core::BaseError<ParseErrorType>;
+
+/// Represents the different types of errors that can occur during parsing.
+#[derive(Debug, PartialEq, thiserror::Error)]
+pub enum ParseErrorType {
+    /// Parser encountered an unexpected end of input
+    Eof,
+    /// Parser encountered an extra token
+    ExtraToken(Tok),
+    /// Parser encountered an invalid token
+    InvalidToken,
+    /// Parser encountered an unexpected token
+    UnrecognizedToken(Tok, Option<String>),
+    // Maps to `User` type from `lalrpop-util`
+    /// Parser encountered an error during lexing.
+    Lexical(LexicalErrorType),
+}
+
+// Convert `lalrpop_util::ParseError` to our internal type
+fn parse_error_from_lalrpop(
+    err: LalrpopError<Location, Tok, LexicalError>,
+    source_path: &str,
+) -> ParseError {
+    let source_path = source_path.to_owned();
+
+    match err {
+        // TODO: Are there cases where this isn't an EOF?
+        LalrpopError::InvalidToken { location } => ParseError {
+            error: ParseErrorType::Eof,
+            location,
+            source_path,
+        },
+        LalrpopError::ExtraToken { token } => ParseError {
+            error: ParseErrorType::ExtraToken(token.1),
+            location: token.0,
+            source_path,
+        },
+        LalrpopError::User { error } => ParseError {
+            error: ParseErrorType::Lexical(error.error),
+            location: error.location,
+            source_path,
+        },
+        LalrpopError::UnrecognizedToken { token, expected } => {
+            // Hacky, but it's how CPython does it. See PyParser_AddToken,
+            // in particular "Only one possible expected token" comment.
+            let expected = (expected.len() == 1).then(|| expected[0].clone());
+            ParseError {
+                error: ParseErrorType::UnrecognizedToken(token.1, expected),
+                location: token.0.with_col_offset(1),
+                source_path,
+            }
+        }
+        LalrpopError::UnrecognizedEOF { location, expected } => {
+            // This could be an initial indentation error that we should ignore
+            let indent_error = expected == ["Indent"];
+            if indent_error {
+                ParseError {
+                    error: ParseErrorType::Lexical(LexicalErrorType::IndentationError),
+                    location,
+                    source_path,
+                }
+            } else {
+                ParseError {
+                    error: ParseErrorType::Eof,
+                    location,
+                    source_path,
+                }
+            }
+        }
+    }
+}
+
+impl std::fmt::Display for ParseErrorType {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match *self {
+            ParseErrorType::Eof => write!(f, "Got unexpected EOF"),
+            ParseErrorType::ExtraToken(ref tok) => write!(f, "Got extraneous token: {tok:?}"),
+            ParseErrorType::InvalidToken => write!(f, "Got invalid token"),
+            ParseErrorType::UnrecognizedToken(ref tok, ref expected) => {
+                if *tok == Tok::Indent {
+                    write!(f, "unexpected indent")
+                } else if expected.as_deref() == Some("Indent") {
+                    write!(f, "expected an indented block")
+                } else {
+                    write!(f, "invalid syntax. Got unexpected token {tok}")
+                }
+            }
+            ParseErrorType::Lexical(ref error) => write!(f, "{error}"),
+        }
+    }
+}
+
+impl ParseErrorType {
+    /// Returns true if the error is an indentation error.
+    pub fn is_indentation_error(&self) -> bool {
+        match self {
+            ParseErrorType::Lexical(LexicalErrorType::IndentationError) => true,
+            ParseErrorType::UnrecognizedToken(token, expected) => {
+                *token == Tok::Indent || expected.clone() == Some("Indent".to_owned())
+            }
+            _ => false,
+        }
+    }
+
+    /// Returns true if the error is a tab error.
+    pub fn is_tab_error(&self) -> bool {
+        matches!(
+            self,
+            ParseErrorType::Lexical(LexicalErrorType::TabError)
+                | ParseErrorType::Lexical(LexicalErrorType::TabsAfterSpaces)
+        )
+    }
+}
+
 #[cfg(test)]
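
With the error machinery folded into the parser module, a caller sees one flattened surface: the `parse_*` functions, `ParseError`, and `ParseErrorType` with its helpers. A closing sketch of handling an indentation failure (which concrete variant a stray top-level indent produces is an assumption, but both the lexical and the `UnrecognizedToken` paths satisfy `is_indentation_error`):

```
use rustpython_parser as parser;

fn main() {
    // An indented first statement is invalid at module level.
    let err = parser::parse_program("    x = 1\n", "<embedded>").unwrap_err();
    println!("{} at {:?}", err.error, err.location);
    assert!(err.error.is_indentation_error());
}
```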