Replace LALRPOP parser with hand-written parser (#10036)

(Supersedes #9152, authored by @LaBatata101)

## Summary

This PR replaces the current parser generated from LALRPOP to a
hand-written recursive descent parser.

It also updates the grammar for [PEP
646](https://peps.python.org/pep-0646/) so that the parser outputs the
correct AST. For example, in `data[*x]`, the index expression is now a
tuple with a single starred expression instead of just a starred
expression.

Beyond the performance improvements, the parser is also error resilient
and can provide better error messages. The behavior as seen by any
downstream tools isn't changed. That is, the linter and formatter can
still assume that the parser will _stop_ at the first syntax error. This
will be updated in the following months.

For more details about the change here, refer to the PR corresponding to
the individual commits and the release blog post.

## Test Plan

Write _lots_ and _lots_ of tests for both valid and invalid syntax and
verify the output.

## Acknowledgements

- @MichaReiser for reviewing 100+ parser PRs and continuously providing
guidance throughout the project
- @LaBatata101 for initiating the transition to a hand-written parser in
#9152
- @addisoncrump for implementing the fuzzer which helped
[catch](https://github.com/astral-sh/ruff/pull/10903)
[a](https://github.com/astral-sh/ruff/pull/10910)
[lot](https://github.com/astral-sh/ruff/pull/10966)
[of](https://github.com/astral-sh/ruff/pull/10896)
[bugs](https://github.com/astral-sh/ruff/pull/10877)

---------

Co-authored-by: Victor Hugo Gomes <labatata101@linuxmail.org>
Co-authored-by: Micha Reiser <micha@reiser.io>
This commit is contained in:
Dhruv Manilawala 2024-04-18 17:57:39 +05:30 committed by GitHub
parent e09180b1df
commit 13ffb5bc19
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
852 changed files with 112948 additions and 103620 deletions

View file

@ -6,8 +6,8 @@ use ruff_python_ast::{self as ast, AnyStringKind, Expr};
use ruff_text_size::{Ranged, TextRange, TextSize};
use crate::lexer::{LexicalError, LexicalErrorType};
use crate::token::Tok;
#[derive(Debug)]
pub(crate) enum StringType {
Str(ast::StringLiteral),
Bytes(ast::BytesLiteral),
@ -40,10 +40,15 @@ enum EscapedChar {
}
struct StringParser {
/// The raw content of the string e.g., the `foo` part in `"foo"`.
source: Box<str>,
/// Current position of the parser in the source.
cursor: usize,
/// The kind of string.
kind: AnyStringKind,
/// The location of the first character in the source from the start of the file.
offset: TextSize,
/// The range of the string literal.
range: TextRange,
}
@ -65,9 +70,16 @@ impl StringParser {
skipped_str
}
/// Returns the current position of the parser considering the offset.
#[inline]
fn get_pos(&self) -> TextSize {
self.offset + TextSize::try_from(self.cursor).unwrap()
fn position(&self) -> TextSize {
self.compute_position(self.cursor)
}
/// Computes the position of the cursor considering the offset.
#[inline]
fn compute_position(&self, cursor: usize) -> TextSize {
self.offset + TextSize::try_from(cursor).unwrap()
}
/// Returns the next byte in the string, if there is one.
@ -98,19 +110,32 @@ impl StringParser {
fn parse_unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> {
let mut p: u32 = 0u32;
let unicode_error = LexicalError::new(LexicalErrorType::UnicodeError, self.get_pos());
for i in 1..=literal_number {
let start = self.position();
match self.next_char() {
Some(c) => match c.to_digit(16) {
Some(d) => p += d << ((literal_number - i) * 4),
None => return Err(unicode_error),
None => {
return Err(LexicalError::new(
LexicalErrorType::UnicodeError,
TextRange::at(start, TextSize::try_from(c.len_utf8()).unwrap()),
));
}
},
None => return Err(unicode_error),
None => {
return Err(LexicalError::new(
LexicalErrorType::UnicodeError,
TextRange::empty(self.position()),
))
}
}
}
match p {
0xD800..=0xDFFF => Ok(std::char::REPLACEMENT_CHARACTER),
_ => std::char::from_u32(p).ok_or(unicode_error),
_ => std::char::from_u32(p).ok_or(LexicalError::new(
LexicalErrorType::UnicodeError,
TextRange::empty(self.position()),
)),
}
}
@ -134,33 +159,45 @@ impl StringParser {
}
fn parse_unicode_name(&mut self) -> Result<char, LexicalError> {
let start_pos = self.get_pos();
let start_pos = self.position();
let Some('{') = self.next_char() else {
return Err(LexicalError::new(LexicalErrorType::StringError, start_pos));
return Err(LexicalError::new(
LexicalErrorType::MissingUnicodeLbrace,
TextRange::empty(start_pos),
));
};
let start_pos = self.get_pos();
let start_pos = self.position();
let Some(close_idx) = self.source[self.cursor..].find('}') else {
return Err(LexicalError::new(
LexicalErrorType::StringError,
self.get_pos(),
LexicalErrorType::MissingUnicodeRbrace,
TextRange::empty(self.compute_position(self.source.len())),
));
};
let name_and_ending = self.skip_bytes(close_idx + 1);
let name = &name_and_ending[..name_and_ending.len() - 1];
unicode_names2::character(name)
.ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos))
unicode_names2::character(name).ok_or_else(|| {
LexicalError::new(
LexicalErrorType::UnicodeError,
// The cursor is right after the `}` character, so we subtract 1 to get the correct
// range of the unicode name.
TextRange::new(
start_pos,
self.compute_position(self.cursor - '}'.len_utf8()),
),
)
})
}
/// Parse an escaped character, returning the new character.
fn parse_escaped_char(&mut self) -> Result<Option<EscapedChar>, LexicalError> {
let Some(first_char) = self.next_char() else {
// TODO: check when this error case happens
return Err(LexicalError::new(
LexicalErrorType::StringError,
self.get_pos(),
TextRange::empty(self.position()),
));
};
@ -189,32 +226,19 @@ impl StringParser {
return Ok(None);
}
_ => {
if self.kind.is_byte_string() && !first_char.is_ascii() {
return Err(LexicalError::new(
LexicalErrorType::OtherError(
"bytes can only contain ASCII literal characters"
.to_string()
.into_boxed_str(),
),
self.get_pos(),
));
}
return Ok(Some(EscapedChar::Escape(first_char)));
}
_ => return Ok(Some(EscapedChar::Escape(first_char))),
};
Ok(Some(EscapedChar::Literal(new_char)))
}
fn parse_fstring_middle(mut self) -> Result<ast::FStringElement, LexicalError> {
fn parse_fstring_middle(mut self) -> Result<ast::FStringLiteralElement, LexicalError> {
// Fast-path: if the f-string doesn't contain any escape sequences, return the literal.
let Some(mut index) = memchr::memchr3(b'{', b'}', b'\\', self.source.as_bytes()) else {
return Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
return Ok(ast::FStringLiteralElement {
value: self.source,
range: self.range,
}));
});
};
let mut value = String::with_capacity(self.source.len());
@ -257,18 +281,22 @@ impl StringParser {
// This is still an invalid escape sequence, but we don't want to
// raise a syntax error as is done by the CPython parser. It might
// be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas
b'\\' if !self.kind.is_raw_string() && self.peek_byte().is_some() => {
match self.parse_escaped_char()? {
None => {}
Some(EscapedChar::Literal(c)) => value.push(c),
Some(EscapedChar::Escape(c)) => {
value.push('\\');
value.push(c);
b'\\' => {
if !self.kind.is_raw_string() && self.peek_byte().is_some() {
match self.parse_escaped_char()? {
None => {}
Some(EscapedChar::Literal(c)) => value.push(c),
Some(EscapedChar::Escape(c)) => {
value.push('\\');
value.push(c);
}
}
} else {
value.push('\\');
}
}
ch => {
value.push(char::from(*ch));
unreachable!("Expected '{{', '}}', or '\\' but got {:?}", ch);
}
}
@ -284,21 +312,21 @@ impl StringParser {
index = next_index;
}
Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
Ok(ast::FStringLiteralElement {
value: value.into_boxed_str(),
range: self.range,
}))
})
}
fn parse_bytes(mut self) -> Result<StringType, LexicalError> {
if let Some(index) = self.source.as_bytes().find_non_ascii_byte() {
let ch = self.source.chars().nth(index).unwrap();
return Err(LexicalError::new(
LexicalErrorType::OtherError(
"bytes can only contain ASCII literal characters"
.to_string()
.into_boxed_str(),
LexicalErrorType::InvalidByteLiteral,
TextRange::at(
self.compute_position(index),
TextSize::try_from(ch.len_utf8()).unwrap(),
),
self.offset + TextSize::try_from(index).unwrap(),
));
}
@ -430,156 +458,19 @@ pub(crate) fn parse_string_literal(
StringParser::new(source, kind, range.start() + kind.opener_len(), range).parse()
}
// TODO(dhruvmanila): Move this to the new parser
pub(crate) fn parse_fstring_literal_element(
source: Box<str>,
kind: AnyStringKind,
range: TextRange,
) -> Result<ast::FStringElement, LexicalError> {
) -> Result<ast::FStringLiteralElement, LexicalError> {
StringParser::new(source, kind, range.start(), range).parse_fstring_middle()
}
pub(crate) fn concatenated_strings(
strings: Vec<StringType>,
range: TextRange,
) -> Result<Expr, LexicalError> {
#[cfg(debug_assertions)]
debug_assert!(strings.len() > 1);
let mut has_fstring = false;
let mut byte_literal_count = 0;
for string in &strings {
match string {
StringType::FString(_) => has_fstring = true,
StringType::Bytes(_) => byte_literal_count += 1,
StringType::Str(_) => {}
}
}
let has_bytes = byte_literal_count > 0;
if has_bytes && byte_literal_count < strings.len() {
return Err(LexicalError::new(
LexicalErrorType::OtherError(
"cannot mix bytes and non-bytes literals"
.to_string()
.into_boxed_str(),
),
range.start(),
));
}
if has_bytes {
let mut values = Vec::with_capacity(strings.len());
for string in strings {
match string {
StringType::Bytes(value) => values.push(value),
_ => unreachable!("Unexpected non-bytes literal."),
}
}
return Ok(Expr::from(ast::ExprBytesLiteral {
value: ast::BytesLiteralValue::concatenated(values),
range,
}));
}
if !has_fstring {
let mut values = Vec::with_capacity(strings.len());
for string in strings {
match string {
StringType::Str(value) => values.push(value),
_ => unreachable!("Unexpected non-string literal."),
}
}
return Ok(Expr::from(ast::ExprStringLiteral {
value: ast::StringLiteralValue::concatenated(values),
range,
}));
}
let mut parts = Vec::with_capacity(strings.len());
for string in strings {
match string {
StringType::FString(fstring) => parts.push(ast::FStringPart::FString(fstring)),
StringType::Str(string) => parts.push(ast::FStringPart::Literal(string)),
StringType::Bytes(_) => unreachable!("Unexpected bytes literal."),
}
}
Ok(ast::ExprFString {
value: ast::FStringValue::concatenated(parts),
range,
}
.into())
}
// TODO: consolidate these with ParseError
/// An error that occurred during parsing of an f-string.
#[derive(Debug, Clone, PartialEq)]
struct FStringError {
/// The type of error that occurred.
pub(crate) error: FStringErrorType,
/// The location of the error.
pub(crate) location: TextSize,
}
impl From<FStringError> for LexicalError {
fn from(err: FStringError) -> Self {
LexicalError::new(LexicalErrorType::FStringError(err.error), err.location)
}
}
/// Represents the different types of errors that can occur during parsing of an f-string.
#[derive(Copy, Debug, Clone, PartialEq)]
pub enum FStringErrorType {
/// Expected a right brace after an opened left brace.
UnclosedLbrace,
/// An invalid conversion flag was encountered.
InvalidConversionFlag,
/// A single right brace was encountered.
SingleRbrace,
/// Unterminated string.
UnterminatedString,
/// Unterminated triple-quoted string.
UnterminatedTripleQuotedString,
// TODO(dhruvmanila): The parser can't catch all cases of this error, but
// wherever it can, we'll display the correct error message.
/// A lambda expression without parentheses was encountered.
LambdaWithoutParentheses,
}
impl std::fmt::Display for FStringErrorType {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
use FStringErrorType::{
InvalidConversionFlag, LambdaWithoutParentheses, SingleRbrace, UnclosedLbrace,
UnterminatedString, UnterminatedTripleQuotedString,
};
match self {
UnclosedLbrace => write!(f, "expecting '}}'"),
InvalidConversionFlag => write!(f, "invalid conversion character"),
SingleRbrace => write!(f, "single '}}' is not allowed"),
UnterminatedString => write!(f, "unterminated string"),
UnterminatedTripleQuotedString => write!(f, "unterminated triple-quoted string"),
LambdaWithoutParentheses => {
write!(f, "lambda expressions are not allowed without parentheses")
}
}
}
}
impl From<FStringError> for crate::parser::LalrpopError<TextSize, Tok, LexicalError> {
fn from(err: FStringError) -> Self {
lalrpop_util::ParseError::User {
error: LexicalError::new(LexicalErrorType::FStringError(err.error), err.location),
}
}
}
#[cfg(test)]
mod tests {
use crate::lexer::LexicalErrorType;
use crate::parser::parse_suite;
use crate::{ParseErrorType, Suite};
use super::*;
use crate::{parse_suite, FStringErrorType, ParseErrorType, Suite};
const WINDOWS_EOL: &str = "\r\n";
const MAC_EOL: &str = "\r";
@ -665,6 +556,7 @@ mod tests {
parse_suite(source)
.map_err(|e| match e.error {
ParseErrorType::Lexical(LexicalErrorType::FStringError(e)) => e,
ParseErrorType::FStringError(e) => e,
e => unreachable!("Expected FStringError: {:?}", e),
})
.expect_err("Expected error")
@ -679,10 +571,12 @@ mod tests {
parse_fstring_error("f'{lambda x:{x}}'"),
LambdaWithoutParentheses
);
assert_eq!(
parse_fstring_error("f'{lambda x: {x}}'"),
LambdaWithoutParentheses
);
// NOTE: The parser produces the `LambdaWithoutParentheses` for this case, but
// since the parser only return the first error to maintain compatibility with
// the rest of the codebase, this test case fails. The `LambdaWithoutParentheses`
// error appears after the unexpected `FStringMiddle` token, which is between the
// `:` and the `{`.
// assert_eq!(parse_fstring_error("f'{lambda x: {x}}'"), LambdaWithoutParentheses);
assert!(parse_suite(r#"f"{class}""#,).is_err());
}
@ -912,6 +806,46 @@ mod tests {
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_invalid_unicode_literal() {
let source = r"'\x1ó34'";
let error = parse_suite(source).unwrap_err();
insta::assert_debug_snapshot!(error);
}
#[test]
fn test_missing_unicode_lbrace_error() {
let source = r"'\N '";
let error = parse_suite(source).unwrap_err();
insta::assert_debug_snapshot!(error);
}
#[test]
fn test_missing_unicode_rbrace_error() {
let source = r"'\N{SPACE'";
let error = parse_suite(source).unwrap_err();
insta::assert_debug_snapshot!(error);
}
#[test]
fn test_invalid_unicode_name_error() {
let source = r"'\N{INVALID}'";
let error = parse_suite(source).unwrap_err();
insta::assert_debug_snapshot!(error);
}
#[test]
fn test_invalid_byte_literal_error() {
let source = r"b'123a𝐁c'";
let error = parse_suite(source).unwrap_err();
insta::assert_debug_snapshot!(error);
}
macro_rules! test_aliases_parse {
($($name:ident: $alias:expr,)*) => {
$(