Merge pull request #4543 from youknowone/flatten-parser

Flatten parser interface
Jeong YunWon 2023-02-23 02:09:46 +09:00 committed by GitHub
commit 822f0936ca
14 changed files with 528 additions and 552 deletions


@@ -1,5 +1,3 @@
-use std::fmt::Error;
 use num_bigint::BigInt;
 pub use rustpython_compiler_core::ConversionFlag;
@@ -44,7 +42,9 @@ impl std::fmt::Display for Constant {
             Constant::None => f.pad("None"),
             Constant::Bool(b) => f.pad(if *b { "True" } else { "False" }),
             Constant::Str(s) => rustpython_common::str::repr(s).fmt(f),
-            Constant::Bytes(b) => f.pad(&rustpython_common::bytes::repr(b).map_err(|_err| Error)?),
+            Constant::Bytes(b) => {
+                f.pad(&rustpython_common::bytes::repr(b).map_err(|_err| std::fmt::Error)?)
+            }
             Constant::Int(i) => i.fmt(f),
             Constant::Tuple(tup) => {
                 if let [elt] = &**tup {
@@ -133,12 +133,12 @@ impl<U> crate::fold::Fold<U> for ConstantOptimizer {
 #[cfg(test)]
 mod tests {
+    use super::*;
     #[cfg(feature = "constant-optimization")]
     #[test]
     fn test_constant_opt() {
-        use super::*;
-        use crate::fold::Fold;
-        use crate::*;
+        use crate::{fold::Fold, *};
         let start = Default::default();
         let end = None;
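The `Display` implementation above renders constants as Python literals; the change simply scopes `std::fmt::Error` at the use site instead of importing it at module level. A minimal sketch of the behavior, assuming the `Constant` re-export from `rustpython_ast` and its `num_bigint`-backed `Int` variant:

```rust
use num_bigint::BigInt;
use rustpython_ast::Constant;

fn main() {
    // Each arm of the Display impl renders a Python-style literal.
    assert_eq!(Constant::Bool(true).to_string(), "True");
    assert_eq!(Constant::Int(BigInt::from(42)).to_string(), "42");
    assert_eq!(Constant::None.to_string(), "None");
}
```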


@@ -86,7 +86,7 @@ impl CompileContext {
     }
 }

-/// Compile an ast::Mod produced from rustpython_parser::parser::parse()
+/// Compile an ast::Mod produced from rustpython_parser::parse()
 pub fn compile_top(
     ast: &ast::Mod,
     source_path: String,
@@ -2843,10 +2843,8 @@ fn compile_constant(value: &ast::Constant) -> ConstantData {
 #[cfg(test)]
 mod tests {
-    use super::{CompileOpts, Compiler};
-    use crate::symboltable::SymbolTable;
-    use rustpython_compiler_core::CodeObject;
-    use rustpython_parser::parser;
+    use super::*;
+    use rustpython_parser as parser;

     fn compile_exec(source: &str) -> CodeObject {
         let mut compiler: Compiler = Compiler::new(


@@ -96,7 +96,7 @@ impl Location {
 #[cfg(test)]
 mod tests {
-    use crate::Location;
+    use super::*;

     #[test]
     fn test_gt() {


@@ -15,18 +15,16 @@ impl std::str::FromStr for Mode {
             "exec" => Ok(Mode::Exec),
             "eval" => Ok(Mode::Eval),
             "single" => Ok(Mode::Single),
-            _ => Err(ModeParseError { _priv: () }),
+            _ => Err(ModeParseError(())),
         }
     }
 }

 #[derive(Debug)]
-pub struct ModeParseError {
-    _priv: (),
-}
+pub struct ModeParseError(());

 impl std::fmt::Display for ModeParseError {
     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        write!(f, r#"mode should be "exec", "eval", or "single""#)
+        write!(f, r#"mode must be "exec", "eval", or "single""#)
     }
 }
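`ModeParseError { _priv: () }` becomes a tuple struct with a private unit field: the same "cannot be constructed outside the crate" guarantee with less boilerplate. A self-contained sketch of the pattern (stand-in types mirroring the code above, not the crate's own):

```rust
use std::str::FromStr;

// Opaque error: the private unit field keeps outside code from constructing it.
#[derive(Debug)]
pub struct ModeParseError(());

#[derive(Debug, PartialEq)]
pub enum Mode {
    Exec,
    Eval,
    Single,
}

impl FromStr for Mode {
    type Err = ModeParseError;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "exec" => Ok(Mode::Exec),
            "eval" => Ok(Mode::Eval),
            "single" => Ok(Mode::Single),
            _ => Err(ModeParseError(())),
        }
    }
}

fn main() {
    assert_eq!("eval".parse::<Mode>().ok(), Some(Mode::Eval));
    assert!("compile".parse::<Mode>().is_err());
}
```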


@@ -5,12 +5,11 @@
 use crate::{
     ast,
-    error::{LexicalError, LexicalErrorType},
+    lexer::{LexicalError, LexicalErrorType},
     function::{ArgumentList, parse_args, parse_params, validate_arguments},
-    lexer,
     context::set_context,
     string::parse_strings,
-    token::StringKind,
+    token::{self, StringKind},
 };

 use num_bigint::BigInt;
@@ -1937,106 +1936,106 @@ extern {
     type Location = ast::Location;
     type Error = LexicalError;

-    enum lexer::Tok {
-        Indent => lexer::Tok::Indent,
-        Dedent => lexer::Tok::Dedent,
-        StartModule => lexer::Tok::StartModule,
-        StartInteractive => lexer::Tok::StartInteractive,
-        StartExpression => lexer::Tok::StartExpression,
-        "+" => lexer::Tok::Plus,
-        "-" => lexer::Tok::Minus,
-        "~" => lexer::Tok::Tilde,
-        ":" => lexer::Tok::Colon,
-        "." => lexer::Tok::Dot,
-        "..." => lexer::Tok::Ellipsis,
-        "," => lexer::Tok::Comma,
-        "*" => lexer::Tok::Star,
-        "**" => lexer::Tok::DoubleStar,
-        "&" => lexer::Tok::Amper,
-        "@" => lexer::Tok::At,
-        "%" => lexer::Tok::Percent,
-        "//" => lexer::Tok::DoubleSlash,
-        "^" => lexer::Tok::CircumFlex,
-        "|" => lexer::Tok::Vbar,
-        "<<" => lexer::Tok::LeftShift,
-        ">>" => lexer::Tok::RightShift,
-        "/" => lexer::Tok::Slash,
-        "(" => lexer::Tok::Lpar,
-        ")" => lexer::Tok::Rpar,
-        "[" => lexer::Tok::Lsqb,
-        "]" => lexer::Tok::Rsqb,
-        "{" => lexer::Tok::Lbrace,
-        "}" => lexer::Tok::Rbrace,
-        "=" => lexer::Tok::Equal,
-        "+=" => lexer::Tok::PlusEqual,
-        "-=" => lexer::Tok::MinusEqual,
-        "*=" => lexer::Tok::StarEqual,
-        "@=" => lexer::Tok::AtEqual,
-        "/=" => lexer::Tok::SlashEqual,
-        "%=" => lexer::Tok::PercentEqual,
-        "&=" => lexer::Tok::AmperEqual,
-        "|=" => lexer::Tok::VbarEqual,
-        "^=" => lexer::Tok::CircumflexEqual,
-        "<<=" => lexer::Tok::LeftShiftEqual,
-        ">>=" => lexer::Tok::RightShiftEqual,
-        "**=" => lexer::Tok::DoubleStarEqual,
-        "//=" => lexer::Tok::DoubleSlashEqual,
-        ":=" => lexer::Tok::ColonEqual,
-        "==" => lexer::Tok::EqEqual,
-        "!=" => lexer::Tok::NotEqual,
-        "<" => lexer::Tok::Less,
-        "<=" => lexer::Tok::LessEqual,
-        ">" => lexer::Tok::Greater,
-        ">=" => lexer::Tok::GreaterEqual,
-        "->" => lexer::Tok::Rarrow,
-        "and" => lexer::Tok::And,
-        "as" => lexer::Tok::As,
-        "assert" => lexer::Tok::Assert,
-        "async" => lexer::Tok::Async,
-        "await" => lexer::Tok::Await,
-        "break" => lexer::Tok::Break,
-        "class" => lexer::Tok::Class,
-        "continue" => lexer::Tok::Continue,
-        "def" => lexer::Tok::Def,
-        "del" => lexer::Tok::Del,
-        "elif" => lexer::Tok::Elif,
-        "else" => lexer::Tok::Else,
-        "except" => lexer::Tok::Except,
-        "finally" => lexer::Tok::Finally,
-        "for" => lexer::Tok::For,
-        "from" => lexer::Tok::From,
-        "global" => lexer::Tok::Global,
-        "if" => lexer::Tok::If,
-        "import" => lexer::Tok::Import,
-        "in" => lexer::Tok::In,
-        "is" => lexer::Tok::Is,
-        "lambda" => lexer::Tok::Lambda,
-        "nonlocal" => lexer::Tok::Nonlocal,
-        "not" => lexer::Tok::Not,
-        "or" => lexer::Tok::Or,
-        "pass" => lexer::Tok::Pass,
-        "raise" => lexer::Tok::Raise,
-        "return" => lexer::Tok::Return,
-        "try" => lexer::Tok::Try,
-        "while" => lexer::Tok::While,
-        "match" => lexer::Tok::Match,
-        "case" => lexer::Tok::Case,
-        "with" => lexer::Tok::With,
-        "yield" => lexer::Tok::Yield,
-        "True" => lexer::Tok::True,
-        "False" => lexer::Tok::False,
-        "None" => lexer::Tok::None,
-        int => lexer::Tok::Int { value: <BigInt> },
-        float => lexer::Tok::Float { value: <f64> },
-        complex => lexer::Tok::Complex { real: <f64>, imag: <f64> },
-        string => lexer::Tok::String {
+    enum token::Tok {
+        Indent => token::Tok::Indent,
+        Dedent => token::Tok::Dedent,
+        StartModule => token::Tok::StartModule,
+        StartInteractive => token::Tok::StartInteractive,
+        StartExpression => token::Tok::StartExpression,
+        "+" => token::Tok::Plus,
+        "-" => token::Tok::Minus,
+        "~" => token::Tok::Tilde,
+        ":" => token::Tok::Colon,
+        "." => token::Tok::Dot,
+        "..." => token::Tok::Ellipsis,
+        "," => token::Tok::Comma,
+        "*" => token::Tok::Star,
+        "**" => token::Tok::DoubleStar,
+        "&" => token::Tok::Amper,
+        "@" => token::Tok::At,
+        "%" => token::Tok::Percent,
+        "//" => token::Tok::DoubleSlash,
+        "^" => token::Tok::CircumFlex,
+        "|" => token::Tok::Vbar,
+        "<<" => token::Tok::LeftShift,
+        ">>" => token::Tok::RightShift,
+        "/" => token::Tok::Slash,
+        "(" => token::Tok::Lpar,
+        ")" => token::Tok::Rpar,
+        "[" => token::Tok::Lsqb,
+        "]" => token::Tok::Rsqb,
+        "{" => token::Tok::Lbrace,
+        "}" => token::Tok::Rbrace,
+        "=" => token::Tok::Equal,
+        "+=" => token::Tok::PlusEqual,
+        "-=" => token::Tok::MinusEqual,
+        "*=" => token::Tok::StarEqual,
+        "@=" => token::Tok::AtEqual,
+        "/=" => token::Tok::SlashEqual,
+        "%=" => token::Tok::PercentEqual,
+        "&=" => token::Tok::AmperEqual,
+        "|=" => token::Tok::VbarEqual,
+        "^=" => token::Tok::CircumflexEqual,
+        "<<=" => token::Tok::LeftShiftEqual,
+        ">>=" => token::Tok::RightShiftEqual,
+        "**=" => token::Tok::DoubleStarEqual,
+        "//=" => token::Tok::DoubleSlashEqual,
+        ":=" => token::Tok::ColonEqual,
+        "==" => token::Tok::EqEqual,
+        "!=" => token::Tok::NotEqual,
+        "<" => token::Tok::Less,
+        "<=" => token::Tok::LessEqual,
+        ">" => token::Tok::Greater,
+        ">=" => token::Tok::GreaterEqual,
+        "->" => token::Tok::Rarrow,
+        "and" => token::Tok::And,
+        "as" => token::Tok::As,
+        "assert" => token::Tok::Assert,
+        "async" => token::Tok::Async,
+        "await" => token::Tok::Await,
+        "break" => token::Tok::Break,
+        "class" => token::Tok::Class,
+        "continue" => token::Tok::Continue,
+        "def" => token::Tok::Def,
+        "del" => token::Tok::Del,
+        "elif" => token::Tok::Elif,
+        "else" => token::Tok::Else,
+        "except" => token::Tok::Except,
+        "finally" => token::Tok::Finally,
+        "for" => token::Tok::For,
+        "from" => token::Tok::From,
+        "global" => token::Tok::Global,
+        "if" => token::Tok::If,
+        "import" => token::Tok::Import,
+        "in" => token::Tok::In,
+        "is" => token::Tok::Is,
+        "lambda" => token::Tok::Lambda,
+        "nonlocal" => token::Tok::Nonlocal,
+        "not" => token::Tok::Not,
+        "or" => token::Tok::Or,
+        "pass" => token::Tok::Pass,
+        "raise" => token::Tok::Raise,
+        "return" => token::Tok::Return,
+        "try" => token::Tok::Try,
+        "while" => token::Tok::While,
+        "match" => token::Tok::Match,
+        "case" => token::Tok::Case,
+        "with" => token::Tok::With,
+        "yield" => token::Tok::Yield,
+        "True" => token::Tok::True,
+        "False" => token::Tok::False,
+        "None" => token::Tok::None,
+        int => token::Tok::Int { value: <BigInt> },
+        float => token::Tok::Float { value: <f64> },
+        complex => token::Tok::Complex { real: <f64>, imag: <f64> },
+        string => token::Tok::String {
             value: <String>,
             kind: <StringKind>,
             triple_quoted: <bool>
         },
-        name => lexer::Tok::Name { name: <String> },
-        "\n" => lexer::Tok::Newline,
-        ";" => lexer::Tok::Semi,
-        "#" => lexer::Tok::Comment(_),
+        name => token::Tok::Name { name: <String> },
+        "\n" => token::Tok::Newline,
+        ";" => token::Tok::Semi,
+        "#" => token::Tok::Comment(_),
     }
 }
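The grammar's `extern` block now names the token type through `token::Tok` instead of re-importing it via `lexer`. Consumers see the same variants either way; a small sketch, assuming the crate-root re-exports this PR introduces further down:

```rust
use rustpython_parser::{lexer::lex, Mode, Tok};

fn main() {
    // The lexer produces the same Tok variants the grammar maps above.
    for token in lex("x = 1", Mode::Module) {
        let (_, tok, _) = token.expect("failed to lex");
        match tok {
            Tok::Name { name } => println!("name: {name}"),
            Tok::Int { value } => println!("int: {value}"),
            Tok::Equal => println!("equals"),
            other => println!("other: {other:?}"),
        }
    }
}
```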


@@ -1,331 +0,0 @@
//! Error types for the parser.
//!
//! These types are used to represent errors that occur during lexing and parsing and are
//! returned by the `parse_*` functions in the [parser] module and the iterator in the
//! [lexer] implementation.
//!
//! [parser]: crate::parser
//! [lexer]: crate::lexer
// Define internal parse error types.
// The goal is to provide a matching and a safe error API, masking errors from LALR
use crate::{ast::Location, token::Tok};
use lalrpop_util::ParseError as LalrpopError;
use std::fmt;
/// Represents an error during lexing.
#[derive(Debug, PartialEq)]
pub struct LexicalError {
/// The type of error that occurred.
pub error: LexicalErrorType,
/// The location of the error.
pub location: Location,
}
impl LexicalError {
/// Creates a new `LexicalError` with the given error type and location.
pub fn new(error: LexicalErrorType, location: Location) -> Self {
Self { error, location }
}
}
/// Represents the different types of errors that can occur during lexing.
#[derive(Debug, PartialEq)]
pub enum LexicalErrorType {
// TODO: Can probably be removed, the places it is used seem to be able
// to use the `UnicodeError` variant instead.
#[doc(hidden)]
StringError,
// TODO: Should take a start/end position to report.
/// Decoding of a unicode escape sequence in a string literal failed.
UnicodeError,
/// The nesting of brackets/braces/parentheses is not balanced.
NestingError,
/// The indentation is not consistent.
IndentationError,
/// Inconsistent use of tabs and spaces.
TabError,
/// Encountered a tab after a space.
TabsAfterSpaces,
/// A non-default argument follows a default argument.
DefaultArgumentError,
/// A duplicate argument was found in a function definition.
DuplicateArgumentError(String),
/// A positional argument follows a keyword argument.
PositionalArgumentError,
/// An iterable argument unpacking `*args` follows keyword argument unpacking `**kwargs`.
UnpackedArgumentError,
/// A keyword argument was repeated.
DuplicateKeywordArgumentError(String),
/// An unrecognized token was encountered.
UnrecognizedToken { tok: char },
/// An f-string error containing the [`FStringErrorType`].
FStringError(FStringErrorType),
/// An unexpected character was encountered after a line continuation.
LineContinuationError,
/// An unexpected end of file was encountered.
Eof,
/// An unexpected error occurred.
OtherError(String),
}
impl fmt::Display for LexicalErrorType {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
LexicalErrorType::StringError => write!(f, "Got unexpected string"),
LexicalErrorType::FStringError(error) => write!(f, "f-string: {error}"),
LexicalErrorType::UnicodeError => write!(f, "Got unexpected unicode"),
LexicalErrorType::NestingError => write!(f, "Got unexpected nesting"),
LexicalErrorType::IndentationError => {
write!(f, "unindent does not match any outer indentation level")
}
LexicalErrorType::TabError => {
write!(f, "inconsistent use of tabs and spaces in indentation")
}
LexicalErrorType::TabsAfterSpaces => {
write!(f, "Tabs not allowed as part of indentation after spaces")
}
LexicalErrorType::DefaultArgumentError => {
write!(f, "non-default argument follows default argument")
}
LexicalErrorType::DuplicateArgumentError(arg_name) => {
write!(f, "duplicate argument '{arg_name}' in function definition")
}
LexicalErrorType::DuplicateKeywordArgumentError(arg_name) => {
write!(f, "keyword argument repeated: {arg_name}")
}
LexicalErrorType::PositionalArgumentError => {
write!(f, "positional argument follows keyword argument")
}
LexicalErrorType::UnpackedArgumentError => {
write!(
f,
"iterable argument unpacking follows keyword argument unpacking"
)
}
LexicalErrorType::UnrecognizedToken { tok } => {
write!(f, "Got unexpected token {tok}")
}
LexicalErrorType::LineContinuationError => {
write!(f, "unexpected character after line continuation character")
}
LexicalErrorType::Eof => write!(f, "unexpected EOF while parsing"),
LexicalErrorType::OtherError(msg) => write!(f, "{msg}"),
}
}
}
// TODO: consolidate these with ParseError
/// An error that occurred during parsing of an f-string.
#[derive(Debug, PartialEq)]
pub struct FStringError {
/// The type of error that occurred.
pub error: FStringErrorType,
/// The location of the error.
pub location: Location,
}
impl FStringError {
/// Creates a new `FStringError` with the given error type and location.
pub fn new(error: FStringErrorType, location: Location) -> Self {
Self { error, location }
}
}
impl From<FStringError> for LexicalError {
fn from(err: FStringError) -> Self {
LexicalError {
error: LexicalErrorType::FStringError(err.error),
location: err.location,
}
}
}
/// Represents the different types of errors that can occur during parsing of an f-string.
#[derive(Debug, PartialEq)]
pub enum FStringErrorType {
/// Expected a right brace after an opened left brace.
UnclosedLbrace,
/// Expected a left brace after an ending right brace.
UnopenedRbrace,
/// Expected a right brace after a conversion flag.
ExpectedRbrace,
/// An error occurred while parsing an f-string expression.
InvalidExpression(Box<ParseErrorType>),
/// An invalid conversion flag was encountered.
InvalidConversionFlag,
/// An empty expression was encountered.
EmptyExpression,
/// An opening delimiter was not closed properly.
MismatchedDelimiter(char, char),
/// Too many nested expressions in an f-string.
ExpressionNestedTooDeeply,
/// The f-string expression cannot include the given character.
ExpressionCannotInclude(char),
/// A single right brace was encountered.
SingleRbrace,
/// A closing delimiter was not opened properly.
Unmatched(char),
// TODO: Test this case.
/// Unterminated string.
UnterminatedString,
}
impl fmt::Display for FStringErrorType {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
FStringErrorType::UnclosedLbrace => write!(f, "expecting '}}'"),
FStringErrorType::UnopenedRbrace => write!(f, "Unopened '}}'"),
FStringErrorType::ExpectedRbrace => write!(f, "Expected '}}' after conversion flag."),
FStringErrorType::InvalidExpression(error) => {
write!(f, "{error}")
}
FStringErrorType::InvalidConversionFlag => write!(f, "invalid conversion character"),
FStringErrorType::EmptyExpression => write!(f, "empty expression not allowed"),
FStringErrorType::MismatchedDelimiter(first, second) => write!(
f,
"closing parenthesis '{second}' does not match opening parenthesis '{first}'"
),
FStringErrorType::SingleRbrace => write!(f, "single '}}' is not allowed"),
FStringErrorType::Unmatched(delim) => write!(f, "unmatched '{delim}'"),
FStringErrorType::ExpressionNestedTooDeeply => {
write!(f, "expressions nested too deeply")
}
FStringErrorType::UnterminatedString => {
write!(f, "unterminated string")
}
FStringErrorType::ExpressionCannotInclude(c) => {
if *c == '\\' {
write!(f, "f-string expression part cannot include a backslash")
} else {
write!(f, "f-string expression part cannot include '{c}'s")
}
}
}
}
}
impl From<FStringError> for LalrpopError<Location, Tok, LexicalError> {
fn from(err: FStringError) -> Self {
lalrpop_util::ParseError::User {
error: LexicalError {
error: LexicalErrorType::FStringError(err.error),
location: err.location,
},
}
}
}
/// Represents an error during parsing.
pub type ParseError = rustpython_compiler_core::BaseError<ParseErrorType>;
/// Represents the different types of errors that can occur during parsing.
#[derive(Debug, PartialEq, thiserror::Error)]
pub enum ParseErrorType {
/// Parser encountered an unexpected end of input
Eof,
/// Parser encountered an extra token
ExtraToken(Tok),
/// Parser encountered an invalid token
InvalidToken,
/// Parser encountered an unexpected token
UnrecognizedToken(Tok, Option<String>),
// Maps to `User` type from `lalrpop-util`
/// Parser encountered an error during lexing.
Lexical(LexicalErrorType),
}
// Convert `lalrpop_util::ParseError` to our internal type
pub(crate) fn parse_error_from_lalrpop(
err: LalrpopError<Location, Tok, LexicalError>,
source_path: &str,
) -> ParseError {
let source_path = source_path.to_owned();
match err {
// TODO: Are there cases where this isn't an EOF?
LalrpopError::InvalidToken { location } => ParseError {
error: ParseErrorType::Eof,
location,
source_path,
},
LalrpopError::ExtraToken { token } => ParseError {
error: ParseErrorType::ExtraToken(token.1),
location: token.0,
source_path,
},
LalrpopError::User { error } => ParseError {
error: ParseErrorType::Lexical(error.error),
location: error.location,
source_path,
},
LalrpopError::UnrecognizedToken { token, expected } => {
// Hacky, but it's how CPython does it. See PyParser_AddToken,
// in particular "Only one possible expected token" comment.
let expected = (expected.len() == 1).then(|| expected[0].clone());
ParseError {
error: ParseErrorType::UnrecognizedToken(token.1, expected),
location: token.0.with_col_offset(1),
source_path,
}
}
LalrpopError::UnrecognizedEOF { location, expected } => {
// This could be an initial indentation error that we should ignore
let indent_error = expected == ["Indent"];
if indent_error {
ParseError {
error: ParseErrorType::Lexical(LexicalErrorType::IndentationError),
location,
source_path,
}
} else {
ParseError {
error: ParseErrorType::Eof,
location,
source_path,
}
}
}
}
}
impl fmt::Display for ParseErrorType {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match *self {
ParseErrorType::Eof => write!(f, "Got unexpected EOF"),
ParseErrorType::ExtraToken(ref tok) => write!(f, "Got extraneous token: {tok:?}"),
ParseErrorType::InvalidToken => write!(f, "Got invalid token"),
ParseErrorType::UnrecognizedToken(ref tok, ref expected) => {
if *tok == Tok::Indent {
write!(f, "unexpected indent")
} else if expected.as_deref() == Some("Indent") {
write!(f, "expected an indented block")
} else {
write!(f, "invalid syntax. Got unexpected token {tok}")
}
}
ParseErrorType::Lexical(ref error) => write!(f, "{error}"),
}
}
}
impl ParseErrorType {
/// Returns true if the error is an indentation error.
pub fn is_indentation_error(&self) -> bool {
match self {
ParseErrorType::Lexical(LexicalErrorType::IndentationError) => true,
ParseErrorType::UnrecognizedToken(token, expected) => {
*token == Tok::Indent || expected.clone() == Some("Indent".to_owned())
}
_ => false,
}
}
/// Returns true if the error is a tab error.
pub fn is_tab_error(&self) -> bool {
matches!(
self,
ParseErrorType::Lexical(LexicalErrorType::TabError)
| ParseErrorType::Lexical(LexicalErrorType::TabsAfterSpaces)
)
}
}
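Nothing in this deleted file is lost: `LexicalError`/`LexicalErrorType` move into `lexer`, `FStringError`/`FStringErrorType` into `string`, and `ParseError`/`ParseErrorType`/`parse_error_from_lalrpop` into `parser`, with the public pieces re-exported from the crate root. A sketch of the new import surface, assuming the re-exports added later in this diff:

```rust
// Where the deleted definitions now live: lexical errors under `lexer`,
// parse errors on the crate root.
use rustpython_parser::{lexer::LexicalErrorType, ParseError, ParseErrorType};

fn classify(err: &ParseError) -> &'static str {
    match &err.error {
        ParseErrorType::Lexical(LexicalErrorType::FStringError(_)) => "f-string error",
        ParseErrorType::Lexical(_) => "other lexical error",
        ParseErrorType::Eof => "unexpected end of input",
        _ => "syntax error",
    }
}

fn main() {
    if let Err(err) = rustpython_parser::parse_program("def f(:", "<embedded>") {
        println!("{}", classify(&err));
    }
}
```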


@@ -1,7 +1,9 @@
 // Contains functions that perform validation and parsing of arguments and parameters.
 // Checks apply both to functions and to lambdas.
-use crate::ast;
-use crate::error::{LexicalError, LexicalErrorType};
+use crate::{
+    ast,
+    lexer::{LexicalError, LexicalErrorType},
+};
 use rustc_hash::FxHashSet;

 pub(crate) struct ArgumentList {
@@ -149,8 +151,8 @@ fn is_starred(exp: &ast::Expr) -> bool {
 #[cfg(test)]
 mod tests {
-    use crate::error::{LexicalErrorType, ParseErrorType};
-    use crate::parser::parse_program;
+    use super::*;
+    use crate::parser::{parse_program, ParseErrorType};

     macro_rules! function_and_lambda {
         ($($name:ident: $code:expr,)*) => {


@@ -4,7 +4,7 @@
 //! governing what is and is not a valid token are defined in the Python reference
 //! guide section on [Lexical analysis].
 //!
-//! The primary function in this module is [`make_tokenizer`], which takes a string slice
+//! The primary function in this module is [`lex`], which takes a string slice
 //! and returns an iterator over the tokens in the source code. The tokens are currently returned
 //! as a `Result<Spanned, LexicalError>`, where [`Spanned`] is a tuple containing the
 //! start and end [`Location`] and a [`Tok`] denoting the token.
@@ -12,12 +12,10 @@
 //! # Example
 //!
 //! ```
-//! use rustpython_parser::lexer::{make_tokenizer, Tok};
-//! use rustpython_parser::mode::Mode;
-//! use rustpython_parser::token::StringKind;
+//! use rustpython_parser::{lexer::lex, Tok, Mode, StringKind};
 //!
 //! let source = "x = 'RustPython'";
-//! let tokens = make_tokenizer(source, Mode::Module)
+//! let tokens = lex(source, Mode::Module)
 //!     .map(|tok| tok.expect("Failed to lex"))
 //!     .collect::<Vec<_>>();
 //!
@@ -33,19 +31,17 @@
 //! ```
 //!
 //! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html

-pub use super::token::{StringKind, Tok};
-use crate::ast::Location;
-use crate::error::{LexicalError, LexicalErrorType};
-use crate::mode::Mode;
-use crate::soft_keywords::SoftKeywordTransformer;
+use crate::{
+    ast::Location,
+    mode::Mode,
+    soft_keywords::SoftKeywordTransformer,
+    string::FStringErrorType,
+    token::{StringKind, Tok},
+};
+use log::trace;
 use num_bigint::BigInt;
-use num_traits::identities::Zero;
-use num_traits::Num;
-use std::char;
-use std::cmp::Ordering;
-use std::ops::Index;
-use std::slice::SliceIndex;
-use std::str::FromStr;
+use num_traits::{Num, Zero};
+use std::{char, cmp::Ordering, ops::Index, slice::SliceIndex, str::FromStr};
 use unic_emoji_char::is_emoji_presentation;
 use unic_ucd_ident::{is_xid_continue, is_xid_start};
@@ -195,29 +191,28 @@ pub type Spanned = (Location, Tok, Location);
 /// The result of lexing a token.
 pub type LexResult = Result<Spanned, LexicalError>;

-/// Create a new tokenizer from a source string.
+/// Create a new lexer from a source string.
 ///
 /// # Examples
 ///
 /// ```
-/// use rustpython_parser::mode::Mode;
-/// use rustpython_parser::lexer::{make_tokenizer};
+/// use rustpython_parser::{Mode, lexer::lex};
 ///
 /// let source = "def hello(): return 'world'";
-/// let tokenizer = make_tokenizer(source, Mode::Module);
+/// let lexer = lex(source, Mode::Module);
 ///
-/// for token in tokenizer {
+/// for token in lexer {
 ///     println!("{:?}", token);
 /// }
 /// ```
 #[inline]
-pub fn make_tokenizer(source: &str, mode: Mode) -> impl Iterator<Item = LexResult> + '_ {
-    make_tokenizer_located(source, mode, Location::default())
+pub fn lex(source: &str, mode: Mode) -> impl Iterator<Item = LexResult> + '_ {
+    lex_located(source, mode, Location::default())
 }

-/// Create a new tokenizer from a source string, starting at a given location.
-/// You probably want to use [`make_tokenizer`] instead.
-pub fn make_tokenizer_located(
+/// Create a new lexer from a source string, starting at a given location.
+/// You probably want to use [`lex`] instead.
+pub fn lex_located(
     source: &str,
     mode: Mode,
     start_location: Location,
@@ -230,7 +225,7 @@ where
     T: Iterator<Item = char>,
 {
     /// Create a new lexer from T and a starting location. You probably want to use
-    /// [`make_tokenizer`] instead.
+    /// [`lex`] instead.
     pub fn new(input: T, start: Location) -> Self {
         let mut lxr = Lexer {
             at_begin_of_line: true,
@@ -1212,10 +1207,115 @@ where
     }
 }
/// Represents an error that occur during lexing and are
/// returned by the `parse_*` functions in the iterator in the
/// [lexer] implementation.
///
/// [lexer]: crate::lexer
#[derive(Debug, PartialEq)]
pub struct LexicalError {
/// The type of error that occurred.
pub error: LexicalErrorType,
/// The location of the error.
pub location: Location,
}
impl LexicalError {
/// Creates a new `LexicalError` with the given error type and location.
pub fn new(error: LexicalErrorType, location: Location) -> Self {
Self { error, location }
}
}
/// Represents the different types of errors that can occur during lexing.
#[derive(Debug, PartialEq)]
pub enum LexicalErrorType {
// TODO: Can probably be removed, the places it is used seem to be able
// to use the `UnicodeError` variant instead.
#[doc(hidden)]
StringError,
// TODO: Should take a start/end position to report.
/// Decoding of a unicode escape sequence in a string literal failed.
UnicodeError,
/// The nesting of brackets/braces/parentheses is not balanced.
NestingError,
/// The indentation is not consistent.
IndentationError,
/// Inconsistent use of tabs and spaces.
TabError,
/// Encountered a tab after a space.
TabsAfterSpaces,
/// A non-default argument follows a default argument.
DefaultArgumentError,
/// A duplicate argument was found in a function definition.
DuplicateArgumentError(String),
/// A positional argument follows a keyword argument.
PositionalArgumentError,
/// An iterable argument unpacking `*args` follows keyword argument unpacking `**kwargs`.
UnpackedArgumentError,
/// A keyword argument was repeated.
DuplicateKeywordArgumentError(String),
/// An unrecognized token was encountered.
UnrecognizedToken { tok: char },
/// An f-string error containing the [`FStringErrorType`].
FStringError(FStringErrorType),
/// An unexpected character was encountered after a line continuation.
LineContinuationError,
/// An unexpected end of file was encountered.
Eof,
/// An unexpected error occurred.
OtherError(String),
}
impl std::fmt::Display for LexicalErrorType {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
LexicalErrorType::StringError => write!(f, "Got unexpected string"),
LexicalErrorType::FStringError(error) => write!(f, "f-string: {error}"),
LexicalErrorType::UnicodeError => write!(f, "Got unexpected unicode"),
LexicalErrorType::NestingError => write!(f, "Got unexpected nesting"),
LexicalErrorType::IndentationError => {
write!(f, "unindent does not match any outer indentation level")
}
LexicalErrorType::TabError => {
write!(f, "inconsistent use of tabs and spaces in indentation")
}
LexicalErrorType::TabsAfterSpaces => {
write!(f, "Tabs not allowed as part of indentation after spaces")
}
LexicalErrorType::DefaultArgumentError => {
write!(f, "non-default argument follows default argument")
}
LexicalErrorType::DuplicateArgumentError(arg_name) => {
write!(f, "duplicate argument '{arg_name}' in function definition")
}
LexicalErrorType::DuplicateKeywordArgumentError(arg_name) => {
write!(f, "keyword argument repeated: {arg_name}")
}
LexicalErrorType::PositionalArgumentError => {
write!(f, "positional argument follows keyword argument")
}
LexicalErrorType::UnpackedArgumentError => {
write!(
f,
"iterable argument unpacking follows keyword argument unpacking"
)
}
LexicalErrorType::UnrecognizedToken { tok } => {
write!(f, "Got unexpected token {tok}")
}
LexicalErrorType::LineContinuationError => {
write!(f, "unexpected character after line continuation character")
}
LexicalErrorType::Eof => write!(f, "unexpected EOF while parsing"),
LexicalErrorType::OtherError(msg) => write!(f, "{msg}"),
}
}
}
 #[cfg(test)]
 mod tests {
-    use super::{make_tokenizer, StringKind, Tok};
-    use crate::mode::Mode;
+    use super::*;
     use num_bigint::BigInt;

     const WINDOWS_EOL: &str = "\r\n";
@@ -1223,7 +1323,7 @@ mod tests {
     const UNIX_EOL: &str = "\n";

     pub fn lex_source(source: &str) -> Vec<Tok> {
-        let lexer = make_tokenizer(source, Mode::Module);
+        let lexer = lex(source, Mode::Module);
         lexer.map(|x| x.unwrap().1).collect()
     }


@@ -54,41 +54,37 @@
 //!
 //! The functionality of this crate is split into several modules:
 //!
-//! - [token]: This module contains the definition of the tokens that are generated by the lexer.
+//! - token: This module contains the definition of the tokens that are generated by the lexer.
 //! - [lexer]: This module contains the lexer and is responsible for generating the tokens.
-//! - [parser]: This module contains an interface to the parser and is responsible for generating the AST.
+//! - parser: This module contains an interface to the parser and is responsible for generating the AST.
 //!   - Functions and strings have special parsing requirements that are handled in additional files.
-//! - [mode]: This module contains the definition of the different modes that the parser can be in.
-//! - [error]: This module contains the definition of the errors that can be returned by the parser.
+//! - mode: This module contains the definition of the different modes that the parser can be in.
 //!
 //! # Examples
 //!
 //! For example, to get a stream of tokens from a given string, one could do this:
 //!
 //! ```
-//! use rustpython_parser::mode::Mode;
-//! use rustpython_parser::lexer::make_tokenizer;
+//! use rustpython_parser::{lexer::lex, Mode};
 //!
 //! let python_source = r#"
 //! def is_odd(i):
 //!     return bool(i & 1)
 //! "#;
-//! let mut tokens = make_tokenizer(python_source, Mode::Module);
+//! let mut tokens = lex(python_source, Mode::Module);
 //! assert!(tokens.all(|t| t.is_ok()));
 //! ```
 //!
 //! These tokens can be directly fed into the parser to generate an AST:
 //!
 //! ```
-//! use rustpython_parser::lexer::make_tokenizer;
-//! use rustpython_parser::mode::Mode;
-//! use rustpython_parser::parser::parse_tokens;
+//! use rustpython_parser::{lexer::lex, Mode, parse_tokens};
 //!
 //! let python_source = r#"
 //! def is_odd(i):
 //!     return bool(i & 1)
 //! "#;
-//! let tokens = make_tokenizer(python_source, Mode::Module);
+//! let tokens = lex(python_source, Mode::Module);
 //! let ast = parse_tokens(tokens, Mode::Module, "<embedded>");
 //!
 //! assert!(ast.is_ok());
@@ -98,7 +94,7 @@
 //! mode or tokenizing the source beforehand:
 //!
 //! ```
-//! use rustpython_parser::parser::parse_program;
+//! use rustpython_parser::parse_program;
 //!
 //! let python_source = r#"
 //! def is_odd(i):
@@ -111,27 +107,29 @@
 //!
 //! [lexical analysis]: https://en.wikipedia.org/wiki/Lexical_analysis
 //! [parsing]: https://en.wikipedia.org/wiki/Parsing
-//! [token]: crate::token
 //! [lexer]: crate::lexer
-//! [parser]: crate::parser
-//! [mode]: crate::mode
-//! [error]: crate::error

 #![doc(html_logo_url = "https://raw.githubusercontent.com/RustPython/RustPython/main/logo.png")]
 #![doc(html_root_url = "https://docs.rs/rustpython-parser/")]

-#[macro_use]
-extern crate log;
 pub use rustpython_ast as ast;

-pub mod error;
 mod function;
+// Skip flattening lexer to distinguish from full parser
 pub mod lexer;
-pub mod mode;
-pub mod parser;
+mod mode;
+mod parser;
 mod string;
 #[rustfmt::skip]
 mod python;
 mod context;
 mod soft_keywords;
-pub mod token;
+mod token;
+
+pub use mode::Mode;
+pub use parser::{
+    parse, parse_expression, parse_expression_located, parse_located, parse_program, parse_tokens,
+    ParseError, ParseErrorType,
+};
+pub use string::FStringErrorType;
+pub use token::{StringKind, Tok};
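The net effect for downstream crates: one flattened import from the crate root instead of paths into now-private modules (only `lexer` deliberately stays public). A hedged before/after sketch, assuming `ParseError` derives `Debug`:

```rust
// Before this PR, callers reached into submodules:
//     use rustpython_parser::mode::Mode;
//     use rustpython_parser::parser::{parse_program, ParseError};
// After, everything comes off the crate root:
use rustpython_parser::{parse_program, ParseError};

fn main() -> Result<(), ParseError> {
    let program = parse_program("def foo():\n    return 42\n", "<embedded>")?;
    println!("parsed {} top-level statements", program.len());
    Ok(())
}
```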


@@ -39,19 +39,17 @@ impl std::str::FromStr for Mode {
         match s {
             "exec" | "single" => Ok(Mode::Module),
             "eval" => Ok(Mode::Expression),
-            _ => Err(ModeParseError { _priv: () }),
+            _ => Err(ModeParseError(())),
         }
     }
 }

 /// Returned when a given mode is not valid.
 #[derive(Debug)]
-pub struct ModeParseError {
-    _priv: (),
-}
+pub struct ModeParseError(());

 impl std::fmt::Display for ModeParseError {
     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        write!(f, r#"mode should be "exec", "eval", or "single""#)
+        write!(f, r#"mode must be "exec", "eval", or "single""#)
     }
 }
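Note that this parser-side `Mode`, unlike the compiler-core one earlier in the diff, folds `"exec"` and `"single"` into `Mode::Module`. A sketch, assuming the `Mode` re-export added in lib.rs:

```rust
use rustpython_parser::Mode;

fn main() {
    // "exec" and "single" both select module mode in the parser.
    assert!(matches!("exec".parse::<Mode>(), Ok(Mode::Module)));
    assert!(matches!("single".parse::<Mode>(), Ok(Mode::Module)));
    assert!(matches!("eval".parse::<Mode>(), Ok(Mode::Expression)));
    assert!("compile".parse::<Mode>().is_err());
}
```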


@@ -12,13 +12,18 @@
 //! [Abstract Syntax Tree]: https://en.wikipedia.org/wiki/Abstract_syntax_tree
 //! [`Mode`]: crate::mode

-use crate::lexer::{LexResult, Tok};
-pub use crate::mode::Mode;
-use crate::{ast, error::ParseError, lexer, python};
-use ast::Location;
+use crate::{
+    ast::{self, Location},
+    lexer::{self, LexResult, LexicalError, LexicalErrorType},
+    mode::Mode,
+    python,
+    token::Tok,
+};
 use itertools::Itertools;
 use std::iter;

+pub(super) use lalrpop_util::ParseError as LalrpopError;
+
 /// Parse a full Python program usually consisting of multiple lines.
 ///
 /// This is a convenience function that can be used to parse a full Python program without having to
@@ -29,7 +34,7 @@ use std::iter;
 /// For example, parsing a simple function definition and a call to that function:
 ///
 /// ```
-/// use rustpython_parser::parser;
+/// use rustpython_parser as parser;
 /// let source = r#"
 /// def foo():
 ///    return 42
@@ -57,7 +62,7 @@ pub fn parse_program(source: &str, source_path: &str) -> Result<ast::Suite, ParseError> {
 ///
 /// ```
 /// extern crate num_bigint;
-/// use rustpython_parser::{parser, ast};
+/// use rustpython_parser as parser;
 /// let expr = parser::parse_expression("1 + 2", "<embedded>");
 ///
 /// assert!(expr.is_ok());
@@ -78,8 +83,7 @@ pub fn parse_expression(source: &str, path: &str) -> Result<ast::Expr, ParseError> {
 /// somewhat silly, location:
 ///
 /// ```
-/// use rustpython_parser::parser::parse_expression_located;
-/// use rustpython_parser::ast::Location;
+/// use rustpython_parser::{ast::Location, parse_expression_located};
 ///
 /// let expr = parse_expression_located("1 + 2", "<embedded>", Location::new(5, 20));
 /// assert!(expr.is_ok());
@@ -106,8 +110,7 @@ pub fn parse_expression_located(
 /// parsing:
 ///
 /// ```
-/// use rustpython_parser::mode::Mode;
-/// use rustpython_parser::parser::parse;
+/// use rustpython_parser::{Mode, parse};
 ///
 /// let expr = parse("1 + 2", Mode::Expression, "<embedded>");
 /// assert!(expr.is_ok());
@@ -116,8 +119,7 @@ pub fn parse_expression_located(
 /// Alternatively, we can parse a full Python program consisting of multiple lines:
 ///
 /// ```
-/// use rustpython_parser::mode::Mode;
-/// use rustpython_parser::parser::parse;
+/// use rustpython_parser::{Mode, parse};
 ///
 /// let source = r#"
 /// class Greeter:
@@ -140,9 +142,7 @@ pub fn parse(source: &str, mode: Mode, source_path: &str) -> Result<ast::Mod, ParseError> {
 /// # Example
 ///
 /// ```
-/// use rustpython_parser::ast::Location;
-/// use rustpython_parser::mode::Mode;
-/// use rustpython_parser::parser::parse_located;
+/// use rustpython_parser::{ast::Location, Mode, parse_located};
 ///
 /// let source = r#"
 /// def fib(i):
@@ -162,7 +162,7 @@ pub fn parse_located(
     source_path: &str,
     location: Location,
 ) -> Result<ast::Mod, ParseError> {
-    let lxr = lexer::make_tokenizer_located(source, mode, location);
+    let lxr = lexer::lex_located(source, mode, location);
     parse_tokens(lxr, mode, source_path)
 }
@@ -173,14 +173,12 @@ pub fn parse_located(
 /// # Example
 ///
 /// As an example, instead of parsing a string, we can parse a list of tokens after we generate
-/// them using the [`lexer::make_tokenizer`] function:
+/// them using the [`lexer::lex`] function:
 ///
 /// ```
-/// use rustpython_parser::lexer::make_tokenizer;
-/// use rustpython_parser::mode::Mode;
-/// use rustpython_parser::parser::parse_tokens;
+/// use rustpython_parser::{lexer::lex, Mode, parse_tokens};
 ///
-/// let expr = parse_tokens(make_tokenizer("1 + 2", Mode::Expression), Mode::Expression, "<embedded>");
+/// let expr = parse_tokens(lex("1 + 2", Mode::Expression), Mode::Expression, "<embedded>");
 /// assert!(expr.is_ok());
 /// ```
 pub fn parse_tokens(
@@ -189,12 +187,127 @@ pub fn parse_tokens(
     source_path: &str,
 ) -> Result<ast::Mod, ParseError> {
     let marker_token = (Default::default(), mode.to_marker(), Default::default());
-    let tokenizer = iter::once(Ok(marker_token))
+    let lexer = iter::once(Ok(marker_token))
         .chain(lxr)
         .filter_ok(|(_, tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline));

     python::TopParser::new()
-        .parse(tokenizer.into_iter())
-        .map_err(|e| crate::error::parse_error_from_lalrpop(e, source_path))
+        .parse(lexer.into_iter())
+        .map_err(|e| parse_error_from_lalrpop(e, source_path))
 }
/// Represents errors that occur during parsing and are
/// returned by the `parse_*` functions.
pub type ParseError = rustpython_compiler_core::BaseError<ParseErrorType>;
/// Represents the different types of errors that can occur during parsing.
#[derive(Debug, PartialEq, thiserror::Error)]
pub enum ParseErrorType {
/// Parser encountered an unexpected end of input
Eof,
/// Parser encountered an extra token
ExtraToken(Tok),
/// Parser encountered an invalid token
InvalidToken,
/// Parser encountered an unexpected token
UnrecognizedToken(Tok, Option<String>),
// Maps to `User` type from `lalrpop-util`
/// Parser encountered an error during lexing.
Lexical(LexicalErrorType),
}
// Convert `lalrpop_util::ParseError` to our internal type
fn parse_error_from_lalrpop(
err: LalrpopError<Location, Tok, LexicalError>,
source_path: &str,
) -> ParseError {
let source_path = source_path.to_owned();
match err {
// TODO: Are there cases where this isn't an EOF?
LalrpopError::InvalidToken { location } => ParseError {
error: ParseErrorType::Eof,
location,
source_path,
},
LalrpopError::ExtraToken { token } => ParseError {
error: ParseErrorType::ExtraToken(token.1),
location: token.0,
source_path,
},
LalrpopError::User { error } => ParseError {
error: ParseErrorType::Lexical(error.error),
location: error.location,
source_path,
},
LalrpopError::UnrecognizedToken { token, expected } => {
// Hacky, but it's how CPython does it. See PyParser_AddToken,
// in particular "Only one possible expected token" comment.
let expected = (expected.len() == 1).then(|| expected[0].clone());
ParseError {
error: ParseErrorType::UnrecognizedToken(token.1, expected),
location: token.0.with_col_offset(1),
source_path,
}
}
LalrpopError::UnrecognizedEOF { location, expected } => {
// This could be an initial indentation error that we should ignore
let indent_error = expected == ["Indent"];
if indent_error {
ParseError {
error: ParseErrorType::Lexical(LexicalErrorType::IndentationError),
location,
source_path,
}
} else {
ParseError {
error: ParseErrorType::Eof,
location,
source_path,
}
}
}
}
}
impl std::fmt::Display for ParseErrorType {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match *self {
ParseErrorType::Eof => write!(f, "Got unexpected EOF"),
ParseErrorType::ExtraToken(ref tok) => write!(f, "Got extraneous token: {tok:?}"),
ParseErrorType::InvalidToken => write!(f, "Got invalid token"),
ParseErrorType::UnrecognizedToken(ref tok, ref expected) => {
if *tok == Tok::Indent {
write!(f, "unexpected indent")
} else if expected.as_deref() == Some("Indent") {
write!(f, "expected an indented block")
} else {
write!(f, "invalid syntax. Got unexpected token {tok}")
}
}
ParseErrorType::Lexical(ref error) => write!(f, "{error}"),
}
}
}
impl ParseErrorType {
/// Returns true if the error is an indentation error.
pub fn is_indentation_error(&self) -> bool {
match self {
ParseErrorType::Lexical(LexicalErrorType::IndentationError) => true,
ParseErrorType::UnrecognizedToken(token, expected) => {
*token == Tok::Indent || expected.clone() == Some("Indent".to_owned())
}
_ => false,
}
}
/// Returns true if the error is a tab error.
pub fn is_tab_error(&self) -> bool {
matches!(
self,
ParseErrorType::Lexical(LexicalErrorType::TabError)
| ParseErrorType::Lexical(LexicalErrorType::TabsAfterSpaces)
)
}
}
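Since `ParseError` and its helper methods now live on the parser's public surface, callers can classify failures without touching a separate `error` module. A sketch; the exact error variant produced for this input is an assumption:

```rust
use rustpython_parser::parse_program;

fn main() {
    // A suite that opens a block but never indents it.
    let err = parse_program("if True:\npass\n", "<embedded>")
        .expect_err("should not parse");
    // Helper methods moved over from error.rs unchanged.
    if err.error.is_indentation_error() {
        println!("indentation problem at {:?}", err.location);
    }
}
```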
 #[cfg(test)]


@@ -1,8 +1,6 @@
+use crate::{lexer::LexResult, mode::Mode, token::Tok};
 use itertools::{Itertools, MultiPeek};
-use crate::lexer::{LexResult, Tok};
-pub use crate::mode::Mode;

 /// An [`Iterator`] that transforms a token stream to accommodate soft keywords (namely, `match`
 /// and `case`).
 ///
@@ -27,9 +25,9 @@ impl<I> SoftKeywordTransformer<I>
 where
     I: Iterator<Item = LexResult>,
 {
-    pub fn new(tokenizer: I, mode: Mode) -> Self {
+    pub fn new(lexer: I, mode: Mode) -> Self {
         Self {
-            underlying: tokenizer.multipeek(),
+            underlying: lexer.multipeek(),
             start_of_line: matches!(mode, Mode::Interactive | Mode::Module),
         }
     }
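The transformer itself stays crate-private; callers only observe its effect through `lex` and the `parse_*` functions, which treat `match`/`case` as keywords only where a match statement can start. A sketch of that observable behavior, assuming CPython-style soft-keyword rules:

```rust
use rustpython_parser::parse_program;

fn main() {
    // `match` as a plain identifier: still valid, thanks to the transformer.
    assert!(parse_program("match = 1\n", "<embedded>").is_ok());

    // `match` as the statement keyword.
    let source = "match x:\n    case 1:\n        pass\n";
    assert!(parse_program(source, "<embedded>").is_ok());
}
```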


@@ -3,22 +3,19 @@
 // The lexer doesn't do any special handling of f-strings, it just treats them as
 // regular strings. Since the parser has no definition of f-string formats (Pending PEP 701)
 // we have to do the parsing here, manually.
-use itertools::Itertools;
-use self::FStringErrorType::*;
 use crate::{
     ast::{Constant, ConversionFlag, Expr, ExprKind, Location},
-    error::{FStringError, FStringErrorType, LexicalError, LexicalErrorType, ParseError},
-    parser::parse_expression_located,
-    token::StringKind,
+    lexer::{LexicalError, LexicalErrorType},
+    parser::{parse_expression_located, LalrpopError, ParseError, ParseErrorType},
+    token::{StringKind, Tok},
 };
-use std::{iter, str};
+use itertools::Itertools;

 // unicode_name2 does not expose `MAX_NAME_LENGTH`, so we replicate that constant here, fix #3798
 const MAX_UNICODE_NAME: usize = 88;

 struct StringParser<'a> {
-    chars: iter::Peekable<str::Chars<'a>>,
+    chars: std::iter::Peekable<std::str::Chars<'a>>,
     kind: StringKind,
     start: Location,
     end: Location,
@@ -178,6 +175,8 @@ impl<'a> StringParser<'a> {
     }

     fn parse_formatted_value(&mut self, nested: u8) -> Result<Vec<Expr>, LexicalError> {
+        use FStringErrorType::*;
+
         let mut expression = String::new();
         let mut spec = None;
         let mut delims = Vec::new();
@@ -403,6 +402,8 @@ impl<'a> StringParser<'a> {
     }

     fn parse_fstring(&mut self, nested: u8) -> Result<Vec<Expr>, LexicalError> {
+        use FStringErrorType::*;
+
         if nested >= 2 {
             return Err(FStringError::new(ExpressionNestedTooDeeply, self.get_pos()).into());
         }
@@ -651,6 +652,108 @@ pub(crate) fn parse_strings(
     ))
 }
// TODO: consolidate these with ParseError
/// An error that occurred during parsing of an f-string.
#[derive(Debug, PartialEq)]
struct FStringError {
/// The type of error that occurred.
pub error: FStringErrorType,
/// The location of the error.
pub location: Location,
}
impl FStringError {
/// Creates a new `FStringError` with the given error type and location.
pub fn new(error: FStringErrorType, location: Location) -> Self {
Self { error, location }
}
}
impl From<FStringError> for LexicalError {
fn from(err: FStringError) -> Self {
LexicalError {
error: LexicalErrorType::FStringError(err.error),
location: err.location,
}
}
}
/// Represents the different types of errors that can occur during parsing of an f-string.
#[derive(Debug, PartialEq)]
pub enum FStringErrorType {
/// Expected a right brace after an opened left brace.
UnclosedLbrace,
/// Expected a left brace after an ending right brace.
UnopenedRbrace,
/// Expected a right brace after a conversion flag.
ExpectedRbrace,
/// An error occurred while parsing an f-string expression.
InvalidExpression(Box<ParseErrorType>),
/// An invalid conversion flag was encountered.
InvalidConversionFlag,
/// An empty expression was encountered.
EmptyExpression,
/// An opening delimiter was not closed properly.
MismatchedDelimiter(char, char),
/// Too many nested expressions in an f-string.
ExpressionNestedTooDeeply,
/// The f-string expression cannot include the given character.
ExpressionCannotInclude(char),
/// A single right brace was encountered.
SingleRbrace,
/// A closing delimiter was not opened properly.
Unmatched(char),
// TODO: Test this case.
/// Unterminated string.
UnterminatedString,
}
impl std::fmt::Display for FStringErrorType {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
use FStringErrorType::*;
match self {
UnclosedLbrace => write!(f, "expecting '}}'"),
UnopenedRbrace => write!(f, "Unopened '}}'"),
ExpectedRbrace => write!(f, "Expected '}}' after conversion flag."),
InvalidExpression(error) => {
write!(f, "{error}")
}
InvalidConversionFlag => write!(f, "invalid conversion character"),
EmptyExpression => write!(f, "empty expression not allowed"),
MismatchedDelimiter(first, second) => write!(
f,
"closing parenthesis '{second}' does not match opening parenthesis '{first}'"
),
SingleRbrace => write!(f, "single '}}' is not allowed"),
Unmatched(delim) => write!(f, "unmatched '{delim}'"),
ExpressionNestedTooDeeply => {
write!(f, "expressions nested too deeply")
}
UnterminatedString => {
write!(f, "unterminated string")
}
ExpressionCannotInclude(c) => {
if *c == '\\' {
write!(f, "f-string expression part cannot include a backslash")
} else {
write!(f, "f-string expression part cannot include '{c}'s")
}
}
}
}
}
impl From<FStringError> for LalrpopError<Location, Tok, LexicalError> {
fn from(err: FStringError) -> Self {
lalrpop_util::ParseError::User {
error: LexicalError {
error: LexicalErrorType::FStringError(err.error),
location: err.location,
},
}
}
}
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -732,6 +835,7 @@ mod tests {
     #[test]
     fn test_parse_invalid_fstring() {
+        use FStringErrorType::*;
         assert_eq!(parse_fstring_error("{5!a"), UnclosedLbrace);
         assert_eq!(parse_fstring_error("{5!a1}"), UnclosedLbrace);
         assert_eq!(parse_fstring_error("{5!"), UnclosedLbrace);
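`FStringErrorType` now lives in `string` but is re-exported at the crate root (see lib.rs above), and f-string failures still surface through the lexical error path. A sketch; the exact variant for this input is an assumption based on the tests above:

```rust
use rustpython_parser::{parse_program, ParseErrorType};

fn main() {
    // An f-string with an unterminated `{` placeholder.
    let err = parse_program(r#"x = f"{5!a""#, "<embedded>").expect_err("unclosed brace");
    if let ParseErrorType::Lexical(lexical) = &err.error {
        // Expected to print something like: f-string: expecting '}'
        println!("{lexical}");
    }
}
```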


@@ -1,8 +1,7 @@
 use rustpython_codegen::{compile, symboltable};
 use rustpython_parser::{
+    self as parser,
     ast::{fold::Fold, ConstantOptimizer},
-    error::ParseErrorType,
-    parser,
 };

 pub use rustpython_codegen::compile::CompileOpts;
@@ -13,13 +12,13 @@ pub enum CompileErrorType {
     #[error(transparent)]
     Codegen(#[from] rustpython_codegen::error::CodegenErrorType),
     #[error(transparent)]
-    Parse(#[from] rustpython_parser::error::ParseErrorType),
+    Parse(#[from] parser::ParseErrorType),
 }

 pub type CompileError = rustpython_compiler_core::CompileError<CompileErrorType>;

-fn error_from_parse(error: rustpython_parser::error::ParseError, source: &str) -> CompileError {
-    let error: CompileErrorBody<ParseErrorType> = error.into();
+fn error_from_parse(error: parser::ParseError, source: &str) -> CompileError {
+    let error: CompileErrorBody<parser::ParseErrorType> = error.into();
     CompileError::from(error, source)
 }