diff --git a/parser/src/lib.rs b/parser/src/lib.rs index ce9dde1..72dcae6 100644 --- a/parser/src/lib.rs +++ b/parser/src/lib.rs @@ -27,9 +27,8 @@ mod function; pub mod lexer; pub mod mode; pub mod parser; -mod string_parser; +mod string; #[rustfmt::skip] mod python; mod context; -mod string; pub mod token; diff --git a/parser/src/snapshots/rustpython_parser__string_parser__tests__fstring_parse_selfdocumenting_base.snap b/parser/src/snapshots/rustpython_parser__string__tests__fstring_parse_selfdocumenting_base.snap similarity index 96% rename from parser/src/snapshots/rustpython_parser__string_parser__tests__fstring_parse_selfdocumenting_base.snap rename to parser/src/snapshots/rustpython_parser__string__tests__fstring_parse_selfdocumenting_base.snap index de77be8..cca26ad 100644 --- a/parser/src/snapshots/rustpython_parser__string_parser__tests__fstring_parse_selfdocumenting_base.snap +++ b/parser/src/snapshots/rustpython_parser__string__tests__fstring_parse_selfdocumenting_base.snap @@ -1,5 +1,6 @@ --- -source: compiler/parser/src/string_parser.rs +source: compiler/parser/src/string.rs +assertion_line: 698 expression: parse_ast --- [ diff --git a/parser/src/snapshots/rustpython_parser__string_parser__tests__fstring_parse_selfdocumenting_base_more.snap b/parser/src/snapshots/rustpython_parser__string__tests__fstring_parse_selfdocumenting_base_more.snap similarity index 98% rename from parser/src/snapshots/rustpython_parser__string_parser__tests__fstring_parse_selfdocumenting_base_more.snap rename to parser/src/snapshots/rustpython_parser__string__tests__fstring_parse_selfdocumenting_base_more.snap index e7f5bff..c425d67 100644 --- a/parser/src/snapshots/rustpython_parser__string_parser__tests__fstring_parse_selfdocumenting_base_more.snap +++ b/parser/src/snapshots/rustpython_parser__string__tests__fstring_parse_selfdocumenting_base_more.snap @@ -1,5 +1,6 @@ --- -source: compiler/parser/src/string_parser.rs +source: compiler/parser/src/string.rs +assertion_line: 706 expression: parse_ast --- [ diff --git a/parser/src/snapshots/rustpython_parser__string_parser__tests__fstring_parse_selfdocumenting_format.snap b/parser/src/snapshots/rustpython_parser__string__tests__fstring_parse_selfdocumenting_format.snap similarity index 98% rename from parser/src/snapshots/rustpython_parser__string_parser__tests__fstring_parse_selfdocumenting_format.snap rename to parser/src/snapshots/rustpython_parser__string__tests__fstring_parse_selfdocumenting_format.snap index 5efa6f6..c402ed6 100644 --- a/parser/src/snapshots/rustpython_parser__string_parser__tests__fstring_parse_selfdocumenting_format.snap +++ b/parser/src/snapshots/rustpython_parser__string__tests__fstring_parse_selfdocumenting_format.snap @@ -1,5 +1,6 @@ --- -source: compiler/parser/src/string_parser.rs +source: compiler/parser/src/string.rs +assertion_line: 714 expression: parse_ast --- [ diff --git a/parser/src/snapshots/rustpython_parser__string__tests__parse_empty_fstring.snap b/parser/src/snapshots/rustpython_parser__string__tests__parse_empty_fstring.snap new file mode 100644 index 0000000..073ac42 --- /dev/null +++ b/parser/src/snapshots/rustpython_parser__string__tests__parse_empty_fstring.snap @@ -0,0 +1,6 @@ +--- +source: compiler/parser/src/string.rs +assertion_line: 690 +expression: "parse_fstring(\"\").unwrap()" +--- +[] diff --git a/parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring.snap b/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring.snap similarity index 97% rename from parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring.snap rename to parser/src/snapshots/rustpython_parser__string__tests__parse_fstring.snap index 32b63f9..10b6f3a 100644 --- a/parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring.snap +++ b/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring.snap @@ -1,5 +1,6 @@ --- -source: compiler/parser/src/string_parser.rs +source: compiler/parser/src/string.rs +assertion_line: 669 expression: parse_ast --- [ diff --git a/parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_equals.snap b/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_equals.snap similarity index 97% rename from parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_equals.snap rename to parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_equals.snap index e9d8120..194061c 100644 --- a/parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_equals.snap +++ b/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_equals.snap @@ -1,5 +1,6 @@ --- -source: compiler/parser/src/string_parser.rs +source: compiler/parser/src/string.rs +assertion_line: 766 expression: parse_ast --- [ diff --git a/parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_nested_spec.snap b/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_nested_spec.snap similarity index 98% rename from parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_nested_spec.snap rename to parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_nested_spec.snap index 268a16e..96ec3e7 100644 --- a/parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_nested_spec.snap +++ b/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_nested_spec.snap @@ -1,5 +1,6 @@ --- -source: compiler/parser/src/string_parser.rs +source: compiler/parser/src/string.rs +assertion_line: 677 expression: parse_ast --- [ diff --git a/parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_not_equals.snap b/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_not_equals.snap similarity index 97% rename from parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_not_equals.snap rename to parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_not_equals.snap index aaa2ad1..cbfe435 100644 --- a/parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_not_equals.snap +++ b/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_not_equals.snap @@ -1,5 +1,6 @@ --- -source: compiler/parser/src/string_parser.rs +source: compiler/parser/src/string.rs +assertion_line: 759 expression: parse_ast --- [ diff --git a/parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_not_nested_spec.snap b/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_not_nested_spec.snap similarity index 97% rename from parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_not_nested_spec.snap rename to parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_not_nested_spec.snap index 0d1107e..2adc5a4 100644 --- a/parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_not_nested_spec.snap +++ b/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_not_nested_spec.snap @@ -1,5 +1,6 @@ --- -source: compiler/parser/src/string_parser.rs +source: compiler/parser/src/string.rs +assertion_line: 685 expression: parse_ast --- [ diff --git a/parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_selfdoc_prec_space.snap b/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_selfdoc_prec_space.snap similarity index 96% rename from parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_selfdoc_prec_space.snap rename to parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_selfdoc_prec_space.snap index 7851495..16d62ea 100644 --- a/parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_selfdoc_prec_space.snap +++ b/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_selfdoc_prec_space.snap @@ -1,5 +1,6 @@ --- -source: compiler/parser/src/string_parser.rs +source: compiler/parser/src/string.rs +assertion_line: 773 expression: parse_ast --- [ diff --git a/parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_selfdoc_trailing_space.snap b/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_selfdoc_trailing_space.snap similarity index 96% rename from parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_selfdoc_trailing_space.snap rename to parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_selfdoc_trailing_space.snap index 621e499..16171f2 100644 --- a/parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_selfdoc_trailing_space.snap +++ b/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_selfdoc_trailing_space.snap @@ -1,5 +1,6 @@ --- -source: compiler/parser/src/string_parser.rs +source: compiler/parser/src/string.rs +assertion_line: 780 expression: parse_ast --- [ diff --git a/parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_yield_expr.snap b/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_yield_expr.snap similarity index 93% rename from parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_yield_expr.snap rename to parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_yield_expr.snap index 62a0cf4..e3a9ff7 100644 --- a/parser/src/snapshots/rustpython_parser__string_parser__tests__parse_fstring_yield_expr.snap +++ b/parser/src/snapshots/rustpython_parser__string__tests__parse_fstring_yield_expr.snap @@ -1,5 +1,6 @@ --- -source: compiler/parser/src/string_parser.rs +source: compiler/parser/src/string.rs +assertion_line: 787 expression: parse_ast --- [ diff --git a/parser/src/snapshots/rustpython_parser__string_parser__tests__parse_empty_fstring.snap b/parser/src/snapshots/rustpython_parser__string_parser__tests__parse_empty_fstring.snap deleted file mode 100644 index fbc08b4..0000000 --- a/parser/src/snapshots/rustpython_parser__string_parser__tests__parse_empty_fstring.snap +++ /dev/null @@ -1,5 +0,0 @@ ---- -source: compiler/parser/src/string_parser.rs -expression: "parse_fstring(\"\").unwrap()" ---- -[] diff --git a/parser/src/string.rs b/parser/src/string.rs index 027d4a9..84c6fe2 100644 --- a/parser/src/string.rs +++ b/parser/src/string.rs @@ -1,12 +1,545 @@ -use crate::{ - ast::{Constant, Expr, ExprKind, Location}, - error::{LexicalError, LexicalErrorType}, - string_parser::parse_string, - token::StringKind, -}; +// Contains the logic for parsing string literals (mostly concerned with f-strings.) +// +// The lexer doesn't do any special handling of f-strings, it just treats them as +// regular strings. Since the parser has no definition of f-string formats (Pending PEP 701) +// we have to do the parsing here, manually. use itertools::Itertools; -pub fn parse_strings( +use self::FStringErrorType::*; +use crate::{ + ast::{Constant, ConversionFlag, Expr, ExprKind, Location}, + error::{FStringError, FStringErrorType, LexicalError, LexicalErrorType, ParseError}, + parser::parse_expression_located, + token::StringKind, +}; +use std::{iter, str}; + +// unicode_name2 does not expose `MAX_NAME_LENGTH`, so we replicate that constant here, fix #3798 +const MAX_UNICODE_NAME: usize = 88; + +struct StringParser<'a> { + chars: iter::Peekable>, + kind: StringKind, + start: Location, + end: Location, + location: Location, +} + +impl<'a> StringParser<'a> { + fn new( + source: &'a str, + kind: StringKind, + triple_quoted: bool, + start: Location, + end: Location, + ) -> Self { + let offset = kind.prefix_len() + if triple_quoted { 3 } else { 1 }; + Self { + chars: source.chars().peekable(), + kind, + start, + end, + location: start.with_col_offset(offset), + } + } + + #[inline] + fn next_char(&mut self) -> Option { + let Some(c) = self.chars.next() else { + return None + }; + if c == '\n' { + self.location.newline(); + } else { + self.location.go_right(); + } + Some(c) + } + + #[inline] + fn peek(&mut self) -> Option<&char> { + self.chars.peek() + } + + #[inline] + fn get_pos(&self) -> Location { + self.location + } + + #[inline] + fn expr(&self, node: ExprKind) -> Expr { + Expr::new(self.start, self.end, node) + } + + fn parse_unicode_literal(&mut self, literal_number: usize) -> Result { + let mut p: u32 = 0u32; + let unicode_error = LexicalError::new(LexicalErrorType::UnicodeError, self.get_pos()); + for i in 1..=literal_number { + match self.next_char() { + Some(c) => match c.to_digit(16) { + Some(d) => p += d << ((literal_number - i) * 4), + None => return Err(unicode_error), + }, + None => return Err(unicode_error), + } + } + match p { + 0xD800..=0xDFFF => Ok(std::char::REPLACEMENT_CHARACTER), + _ => std::char::from_u32(p).ok_or(unicode_error), + } + } + + fn parse_octet(&mut self, first: char) -> char { + let mut octet_content = String::new(); + octet_content.push(first); + while octet_content.len() < 3 { + if let Some('0'..='7') = self.peek() { + octet_content.push(self.next_char().unwrap()) + } else { + break; + } + } + let value = u32::from_str_radix(&octet_content, 8).unwrap(); + char::from_u32(value).unwrap() + } + + fn parse_unicode_name(&mut self) -> Result { + let start_pos = self.get_pos(); + match self.next_char() { + Some('{') => {} + _ => return Err(LexicalError::new(LexicalErrorType::StringError, start_pos)), + } + let start_pos = self.get_pos(); + let mut name = String::new(); + loop { + match self.next_char() { + Some('}') => break, + Some(c) => name.push(c), + None => { + return Err(LexicalError::new( + LexicalErrorType::StringError, + self.get_pos(), + )) + } + } + } + + if name.len() > MAX_UNICODE_NAME { + return Err(LexicalError::new( + LexicalErrorType::UnicodeError, + self.get_pos(), + )); + } + + unicode_names2::character(&name) + .ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos)) + } + + fn parse_escaped_char(&mut self) -> Result { + match self.next_char() { + Some(c) => { + let char = match c { + '\\' => '\\', + '\'' => '\'', + '\"' => '"', + 'a' => '\x07', + 'b' => '\x08', + 'f' => '\x0c', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + 'v' => '\x0b', + o @ '0'..='7' => self.parse_octet(o), + 'x' => self.parse_unicode_literal(2)?, + 'u' if !self.kind.is_bytes() => self.parse_unicode_literal(4)?, + 'U' if !self.kind.is_bytes() => self.parse_unicode_literal(8)?, + 'N' if !self.kind.is_bytes() => self.parse_unicode_name()?, + // Special cases where the escape sequence is not a single character + '\n' => return Ok("".to_string()), + c => { + if self.kind.is_bytes() && !c.is_ascii() { + return Err(LexicalError { + error: LexicalErrorType::OtherError( + "bytes can only contain ASCII literal characters".to_owned(), + ), + location: self.get_pos(), + }); + } + return Ok(format!("\\{c}")); + } + }; + Ok(char.to_string()) + } + None => Err(LexicalError { + error: LexicalErrorType::StringError, + location: self.get_pos(), + }), + } + } + + fn parse_formatted_value(&mut self, nested: u8) -> Result, LexicalError> { + let mut expression = String::new(); + let mut spec = None; + let mut delims = Vec::new(); + let mut conversion = ConversionFlag::None; + let mut self_documenting = false; + let mut trailing_seq = String::new(); + let location = self.get_pos(); + + while let Some(ch) = self.next_char() { + match ch { + // can be integrated better with the remaining code, but as a starting point ok + // in general I would do here a tokenizing of the fstrings to omit this peeking. + '!' | '=' | '>' | '<' if self.peek() == Some(&'=') => { + expression.push(ch); + expression.push('='); + self.next_char(); + } + '!' if delims.is_empty() && self.peek() != Some(&'=') => { + if expression.trim().is_empty() { + return Err(FStringError::new(EmptyExpression, self.get_pos()).into()); + } + + conversion = match self.next_char() { + Some('s') => ConversionFlag::Str, + Some('a') => ConversionFlag::Ascii, + Some('r') => ConversionFlag::Repr, + Some(_) => { + return Err( + FStringError::new(InvalidConversionFlag, self.get_pos()).into() + ); + } + None => { + return Err(FStringError::new(UnclosedLbrace, self.get_pos()).into()); + } + }; + + match self.peek() { + Some('}' | ':') => {} + Some(_) | None => { + return Err(FStringError::new(UnclosedLbrace, self.get_pos()).into()); + } + } + } + + // match a python 3.8 self documenting expression + // format '{' PYTHON_EXPRESSION '=' FORMAT_SPECIFIER? '}' + '=' if self.peek() != Some(&'=') && delims.is_empty() => { + self_documenting = true; + } + + ':' if delims.is_empty() => { + let parsed_spec = self.parse_spec(nested)?; + + spec = Some(Box::new(self.expr(ExprKind::JoinedStr { + values: parsed_spec, + }))); + } + '(' | '{' | '[' => { + expression.push(ch); + delims.push(ch); + } + ')' => { + let last_delim = delims.pop(); + match last_delim { + Some('(') => { + expression.push(ch); + } + Some(c) => { + return Err(FStringError::new( + MismatchedDelimiter(c, ')'), + self.get_pos(), + ) + .into()); + } + None => { + return Err(FStringError::new(Unmatched(')'), self.get_pos()).into()); + } + } + } + ']' => { + let last_delim = delims.pop(); + match last_delim { + Some('[') => { + expression.push(ch); + } + Some(c) => { + return Err(FStringError::new( + MismatchedDelimiter(c, ']'), + self.get_pos(), + ) + .into()); + } + None => { + return Err(FStringError::new(Unmatched(']'), self.get_pos()).into()); + } + } + } + '}' if !delims.is_empty() => { + let last_delim = delims.pop(); + match last_delim { + Some('{') => { + expression.push(ch); + } + Some(c) => { + return Err(FStringError::new( + MismatchedDelimiter(c, '}'), + self.get_pos(), + ) + .into()); + } + None => {} + } + } + '}' => { + if expression.trim().is_empty() { + return Err(FStringError::new(EmptyExpression, self.get_pos()).into()); + } + + let ret = if !self_documenting { + vec![self.expr(ExprKind::FormattedValue { + value: Box::new(parse_fstring_expr(&expression, location).map_err( + |e| { + FStringError::new( + InvalidExpression(Box::new(e.error)), + location, + ) + }, + )?), + conversion: conversion as _, + format_spec: spec, + })] + } else { + vec![ + self.expr(ExprKind::Constant { + value: Constant::Str(expression.to_owned() + "="), + kind: None, + }), + self.expr(ExprKind::Constant { + value: trailing_seq.into(), + kind: None, + }), + self.expr(ExprKind::FormattedValue { + value: Box::new( + parse_fstring_expr(&expression, location).map_err(|e| { + FStringError::new( + InvalidExpression(Box::new(e.error)), + location, + ) + })?, + ), + conversion: (if conversion == ConversionFlag::None && spec.is_none() + { + ConversionFlag::Repr + } else { + conversion + }) as _, + format_spec: spec, + }), + ] + }; + return Ok(ret); + } + '"' | '\'' => { + expression.push(ch); + loop { + let Some(c) = self.next_char() else { + return Err(FStringError::new(UnterminatedString, self.get_pos()).into()); + }; + expression.push(c); + if c == ch { + break; + } + } + } + ' ' if self_documenting => { + trailing_seq.push(ch); + } + '\\' => return Err(FStringError::new(UnterminatedString, self.get_pos()).into()), + _ => { + if self_documenting { + return Err(FStringError::new(UnclosedLbrace, self.get_pos()).into()); + } + + expression.push(ch); + } + } + } + Err(FStringError::new(UnclosedLbrace, self.get_pos()).into()) + } + + fn parse_spec(&mut self, nested: u8) -> Result, LexicalError> { + let mut spec_constructor = Vec::new(); + let mut constant_piece = String::new(); + while let Some(&next) = self.peek() { + match next { + '{' => { + if !constant_piece.is_empty() { + spec_constructor.push(self.expr(ExprKind::Constant { + value: constant_piece.drain(..).collect::().into(), + kind: None, + })); + } + let parsed_expr = self.parse_fstring(nested + 1)?; + spec_constructor.extend(parsed_expr); + continue; + } + '}' => { + break; + } + _ => { + constant_piece.push(next); + } + } + self.next_char(); + } + if !constant_piece.is_empty() { + spec_constructor.push(self.expr(ExprKind::Constant { + value: constant_piece.drain(..).collect::().into(), + kind: None, + })); + } + Ok(spec_constructor) + } + + fn parse_fstring(&mut self, nested: u8) -> Result, LexicalError> { + if nested >= 2 { + return Err(FStringError::new(ExpressionNestedTooDeeply, self.get_pos()).into()); + } + + let mut content = String::new(); + let mut values = vec![]; + + while let Some(&ch) = self.peek() { + match ch { + '{' => { + self.next_char(); + if nested == 0 { + match self.peek() { + Some('{') => { + self.next_char(); + content.push('{'); + continue; + } + None => { + return Err(FStringError::new(UnclosedLbrace, self.get_pos()).into()) + } + _ => {} + } + } + if !content.is_empty() { + values.push(self.expr(ExprKind::Constant { + value: content.drain(..).collect::().into(), + kind: None, + })); + } + + let parsed_values = self.parse_formatted_value(nested)?; + values.extend(parsed_values); + } + '}' => { + if nested > 0 { + break; + } + self.next_char(); + if let Some('}') = self.peek() { + self.next_char(); + content.push('}'); + } else { + return Err(FStringError::new(SingleRbrace, self.get_pos()).into()); + } + } + '\\' if !self.kind.is_raw() => { + self.next_char(); + content.push_str(&self.parse_escaped_char()?); + } + _ => { + content.push(ch); + self.next_char(); + } + } + } + + if !content.is_empty() { + values.push(self.expr(ExprKind::Constant { + value: content.into(), + kind: None, + })) + } + + Ok(values) + } + + fn parse_bytes(&mut self) -> Result { + let mut content = String::new(); + while let Some(ch) = self.next_char() { + match ch { + '\\' if !self.kind.is_raw() => { + content.push_str(&self.parse_escaped_char()?); + } + ch => { + if !ch.is_ascii() { + return Err(LexicalError::new( + LexicalErrorType::OtherError( + "bytes can only contain ASCII literal characters".to_string(), + ), + self.get_pos(), + )); + } + content.push(ch); + } + } + } + + Ok(self.expr(ExprKind::Constant { + value: Constant::Bytes(content.chars().map(|c| c as u8).collect()), + kind: None, + })) + } + + fn parse_string(&mut self) -> Result { + let mut content = String::new(); + while let Some(ch) = self.next_char() { + match ch { + '\\' if !self.kind.is_raw() => { + content.push_str(&self.parse_escaped_char()?); + } + ch => content.push(ch), + } + } + Ok(self.expr(ExprKind::Constant { + value: Constant::Str(content), + kind: self.kind.is_unicode().then(|| "u".to_string()), + })) + } + + fn parse(&mut self) -> Result, LexicalError> { + if self.kind.is_fstring() { + self.parse_fstring(0) + } else if self.kind.is_bytes() { + self.parse_bytes().map(|expr| vec![expr]) + } else { + self.parse_string().map(|expr| vec![expr]) + } + } +} + +fn parse_fstring_expr(source: &str, location: Location) -> Result { + let fstring_body = format!("({source})"); + parse_expression_located(&fstring_body, "", location.with_col_offset(-1)) +} + +fn parse_string( + source: &str, + kind: StringKind, + triple_quoted: bool, + start: Location, + end: Location, +) -> Result, LexicalError> { + StringParser::new(source, kind, triple_quoted, start, end).parse() +} + +pub(crate) fn parse_strings( values: Vec<(Location, (String, StringKind, bool), Location)>, ) -> Result { // Preserve the initial location and kind. @@ -120,8 +653,146 @@ pub fn parse_strings( #[cfg(test)] mod tests { + use super::*; use crate::parser::parse_program; + fn parse_fstring(source: &str) -> Result, LexicalError> { + StringParser::new( + source, + StringKind::FString, + false, + Location::default(), + Location::default().with_col_offset(source.len() + 3), // 3 for prefix and quotes + ) + .parse() + } + + #[test] + fn test_parse_fstring() { + let source = "{a}{ b }{{foo}}"; + let parse_ast = parse_fstring(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_nested_spec() { + let source = "{foo:{spec}}"; + let parse_ast = parse_fstring(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_not_nested_spec() { + let source = "{foo:spec}"; + let parse_ast = parse_fstring(source).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_empty_fstring() { + insta::assert_debug_snapshot!(parse_fstring("").unwrap()); + } + + #[test] + fn test_fstring_parse_selfdocumenting_base() { + let src = "{user=}"; + let parse_ast = parse_fstring(src).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_fstring_parse_selfdocumenting_base_more() { + let src = "mix {user=} with text and {second=}"; + let parse_ast = parse_fstring(src).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_fstring_parse_selfdocumenting_format() { + let src = "{user=:>10}"; + let parse_ast = parse_fstring(src).unwrap(); + + insta::assert_debug_snapshot!(parse_ast); + } + + fn parse_fstring_error(source: &str) -> FStringErrorType { + parse_fstring(source) + .map_err(|e| match e.error { + LexicalErrorType::FStringError(e) => e, + e => unreachable!("Expected FStringError: {:?}", e), + }) + .err() + .expect("Expected error") + } + + #[test] + fn test_parse_invalid_fstring() { + assert_eq!(parse_fstring_error("{5!a"), UnclosedLbrace); + assert_eq!(parse_fstring_error("{5!a1}"), UnclosedLbrace); + assert_eq!(parse_fstring_error("{5!"), UnclosedLbrace); + assert_eq!(parse_fstring_error("abc{!a 'cat'}"), EmptyExpression); + assert_eq!(parse_fstring_error("{!a"), EmptyExpression); + assert_eq!(parse_fstring_error("{ !a}"), EmptyExpression); + + assert_eq!(parse_fstring_error("{5!}"), InvalidConversionFlag); + assert_eq!(parse_fstring_error("{5!x}"), InvalidConversionFlag); + + assert_eq!( + parse_fstring_error("{a:{a:{b}}}"), + ExpressionNestedTooDeeply + ); + + assert_eq!(parse_fstring_error("{a:b}}"), SingleRbrace); + assert_eq!(parse_fstring_error("}"), SingleRbrace); + assert_eq!(parse_fstring_error("{a:{b}"), UnclosedLbrace); + assert_eq!(parse_fstring_error("{"), UnclosedLbrace); + + assert_eq!(parse_fstring_error("{}"), EmptyExpression); + + // TODO: check for InvalidExpression enum? + assert!(parse_fstring("{class}").is_err()); + } + + #[test] + fn test_parse_fstring_not_equals() { + let source = "{1 != 2}"; + let parse_ast = parse_fstring(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_equals() { + let source = "{42 == 42}"; + let parse_ast = parse_fstring(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_selfdoc_prec_space() { + let source = "{x =}"; + let parse_ast = parse_fstring(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_selfdoc_trailing_space() { + let source = "{x= }"; + let parse_ast = parse_fstring(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_parse_fstring_yield_expr() { + let source = "{yield}"; + let parse_ast = parse_fstring(source).unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + #[test] fn test_parse_string_concat() { let source = "'Hello ' 'world'"; diff --git a/parser/src/string_parser.rs b/parser/src/string_parser.rs deleted file mode 100644 index c18ba0f..0000000 --- a/parser/src/string_parser.rs +++ /dev/null @@ -1,675 +0,0 @@ -use self::FStringErrorType::*; -use crate::{ - ast::{Constant, ConversionFlag, Expr, ExprKind, Location}, - error::{FStringError, FStringErrorType, LexicalError, LexicalErrorType, ParseError}, - parser::parse_expression_located, - token::StringKind, -}; -use std::{iter, str}; - -/// unicode_name2 does not expose `MAX_NAME_LENGTH`, so we replicate that constant here, fix #3798 -pub const MAX_UNICODE_NAME: usize = 88; - -pub struct StringParser<'a> { - chars: iter::Peekable>, - kind: StringKind, - str_start: Location, - str_end: Location, - location: Location, -} - -impl<'a> StringParser<'a> { - pub fn new( - source: &'a str, - kind: StringKind, - triple_quoted: bool, - str_start: Location, - str_end: Location, - ) -> Self { - let offset = kind.prefix_len() + if triple_quoted { 3 } else { 1 }; - Self { - chars: source.chars().peekable(), - kind, - str_start, - str_end, - location: str_start.with_col_offset(offset), - } - } - - #[inline] - fn next_char(&mut self) -> Option { - let Some(c) = self.chars.next() else { - return None - }; - if c == '\n' { - self.location.newline(); - } else { - self.location.go_right(); - } - Some(c) - } - - #[inline] - fn peek(&mut self) -> Option<&char> { - self.chars.peek() - } - - #[inline] - fn get_pos(&self) -> Location { - self.location - } - - #[inline] - fn expr(&self, node: ExprKind) -> Expr { - Expr::new(self.str_start, self.str_end, node) - } - - fn parse_unicode_literal(&mut self, literal_number: usize) -> Result { - let mut p: u32 = 0u32; - let unicode_error = LexicalError::new(LexicalErrorType::UnicodeError, self.get_pos()); - for i in 1..=literal_number { - match self.next_char() { - Some(c) => match c.to_digit(16) { - Some(d) => p += d << ((literal_number - i) * 4), - None => return Err(unicode_error), - }, - None => return Err(unicode_error), - } - } - match p { - 0xD800..=0xDFFF => Ok(std::char::REPLACEMENT_CHARACTER), - _ => std::char::from_u32(p).ok_or(unicode_error), - } - } - - fn parse_octet(&mut self, first: char) -> char { - let mut octet_content = String::new(); - octet_content.push(first); - while octet_content.len() < 3 { - if let Some('0'..='7') = self.peek() { - octet_content.push(self.next_char().unwrap()) - } else { - break; - } - } - let value = u32::from_str_radix(&octet_content, 8).unwrap(); - char::from_u32(value).unwrap() - } - - fn parse_unicode_name(&mut self) -> Result { - let start_pos = self.get_pos(); - match self.next_char() { - Some('{') => {} - _ => return Err(LexicalError::new(LexicalErrorType::StringError, start_pos)), - } - let start_pos = self.get_pos(); - let mut name = String::new(); - loop { - match self.next_char() { - Some('}') => break, - Some(c) => name.push(c), - None => { - return Err(LexicalError::new( - LexicalErrorType::StringError, - self.get_pos(), - )) - } - } - } - - if name.len() > MAX_UNICODE_NAME { - return Err(LexicalError::new( - LexicalErrorType::UnicodeError, - self.get_pos(), - )); - } - - unicode_names2::character(&name) - .ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos)) - } - - fn parse_escaped_char(&mut self) -> Result { - match self.next_char() { - Some(c) => { - let char = match c { - '\\' => '\\', - '\'' => '\'', - '\"' => '"', - 'a' => '\x07', - 'b' => '\x08', - 'f' => '\x0c', - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - 'v' => '\x0b', - o @ '0'..='7' => self.parse_octet(o), - 'x' => self.parse_unicode_literal(2)?, - 'u' if !self.kind.is_bytes() => self.parse_unicode_literal(4)?, - 'U' if !self.kind.is_bytes() => self.parse_unicode_literal(8)?, - 'N' if !self.kind.is_bytes() => self.parse_unicode_name()?, - // Special cases where the escape sequence is not a single character - '\n' => return Ok("".to_string()), - c => { - if self.kind.is_bytes() && !c.is_ascii() { - return Err(LexicalError { - error: LexicalErrorType::OtherError( - "bytes can only contain ASCII literal characters".to_owned(), - ), - location: self.get_pos(), - }); - } - return Ok(format!("\\{c}")); - } - }; - Ok(char.to_string()) - } - None => Err(LexicalError { - error: LexicalErrorType::StringError, - location: self.get_pos(), - }), - } - } - - fn parse_formatted_value(&mut self, nested: u8) -> Result, LexicalError> { - let mut expression = String::new(); - let mut spec = None; - let mut delims = Vec::new(); - let mut conversion = ConversionFlag::None; - let mut self_documenting = false; - let mut trailing_seq = String::new(); - let location = self.get_pos(); - - while let Some(ch) = self.next_char() { - match ch { - // can be integrated better with the remaining code, but as a starting point ok - // in general I would do here a tokenizing of the fstrings to omit this peeking. - '!' | '=' | '>' | '<' if self.peek() == Some(&'=') => { - expression.push(ch); - expression.push('='); - self.next_char(); - } - '!' if delims.is_empty() && self.peek() != Some(&'=') => { - if expression.trim().is_empty() { - return Err(FStringError::new(EmptyExpression, self.get_pos()).into()); - } - - conversion = match self.next_char() { - Some('s') => ConversionFlag::Str, - Some('a') => ConversionFlag::Ascii, - Some('r') => ConversionFlag::Repr, - Some(_) => { - return Err( - FStringError::new(InvalidConversionFlag, self.get_pos()).into() - ); - } - None => { - return Err(FStringError::new(UnclosedLbrace, self.get_pos()).into()); - } - }; - - match self.peek() { - Some('}' | ':') => {} - Some(_) | None => { - return Err(FStringError::new(UnclosedLbrace, self.get_pos()).into()); - } - } - } - - // match a python 3.8 self documenting expression - // format '{' PYTHON_EXPRESSION '=' FORMAT_SPECIFIER? '}' - '=' if self.peek() != Some(&'=') && delims.is_empty() => { - self_documenting = true; - } - - ':' if delims.is_empty() => { - let parsed_spec = self.parse_spec(nested)?; - - spec = Some(Box::new(self.expr(ExprKind::JoinedStr { - values: parsed_spec, - }))); - } - '(' | '{' | '[' => { - expression.push(ch); - delims.push(ch); - } - ')' => { - let last_delim = delims.pop(); - match last_delim { - Some('(') => { - expression.push(ch); - } - Some(c) => { - return Err(FStringError::new( - MismatchedDelimiter(c, ')'), - self.get_pos(), - ) - .into()); - } - None => { - return Err(FStringError::new(Unmatched(')'), self.get_pos()).into()); - } - } - } - ']' => { - let last_delim = delims.pop(); - match last_delim { - Some('[') => { - expression.push(ch); - } - Some(c) => { - return Err(FStringError::new( - MismatchedDelimiter(c, ']'), - self.get_pos(), - ) - .into()); - } - None => { - return Err(FStringError::new(Unmatched(']'), self.get_pos()).into()); - } - } - } - '}' if !delims.is_empty() => { - let last_delim = delims.pop(); - match last_delim { - Some('{') => { - expression.push(ch); - } - Some(c) => { - return Err(FStringError::new( - MismatchedDelimiter(c, '}'), - self.get_pos(), - ) - .into()); - } - None => {} - } - } - '}' => { - if expression.trim().is_empty() { - return Err(FStringError::new(EmptyExpression, self.get_pos()).into()); - } - - let ret = if !self_documenting { - vec![self.expr(ExprKind::FormattedValue { - value: Box::new(parse_fstring_expr(&expression, location).map_err( - |e| { - FStringError::new( - InvalidExpression(Box::new(e.error)), - location, - ) - }, - )?), - conversion: conversion as _, - format_spec: spec, - })] - } else { - vec![ - self.expr(ExprKind::Constant { - value: Constant::Str(expression.to_owned() + "="), - kind: None, - }), - self.expr(ExprKind::Constant { - value: trailing_seq.into(), - kind: None, - }), - self.expr(ExprKind::FormattedValue { - value: Box::new( - parse_fstring_expr(&expression, location).map_err(|e| { - FStringError::new( - InvalidExpression(Box::new(e.error)), - location, - ) - })?, - ), - conversion: (if conversion == ConversionFlag::None && spec.is_none() - { - ConversionFlag::Repr - } else { - conversion - }) as _, - format_spec: spec, - }), - ] - }; - return Ok(ret); - } - '"' | '\'' => { - expression.push(ch); - loop { - let Some(c) = self.next_char() else { - return Err(FStringError::new(UnterminatedString, self.get_pos()).into()); - }; - expression.push(c); - if c == ch { - break; - } - } - } - ' ' if self_documenting => { - trailing_seq.push(ch); - } - '\\' => return Err(FStringError::new(UnterminatedString, self.get_pos()).into()), - _ => { - if self_documenting { - return Err(FStringError::new(UnclosedLbrace, self.get_pos()).into()); - } - - expression.push(ch); - } - } - } - Err(FStringError::new(UnclosedLbrace, self.get_pos()).into()) - } - - fn parse_spec(&mut self, nested: u8) -> Result, LexicalError> { - let mut spec_constructor = Vec::new(); - let mut constant_piece = String::new(); - while let Some(&next) = self.peek() { - match next { - '{' => { - if !constant_piece.is_empty() { - spec_constructor.push(self.expr(ExprKind::Constant { - value: constant_piece.drain(..).collect::().into(), - kind: None, - })); - } - let parsed_expr = self.parse_fstring(nested + 1)?; - spec_constructor.extend(parsed_expr); - continue; - } - '}' => { - break; - } - _ => { - constant_piece.push(next); - } - } - self.next_char(); - } - if !constant_piece.is_empty() { - spec_constructor.push(self.expr(ExprKind::Constant { - value: constant_piece.drain(..).collect::().into(), - kind: None, - })); - } - Ok(spec_constructor) - } - - fn parse_fstring(&mut self, nested: u8) -> Result, LexicalError> { - if nested >= 2 { - return Err(FStringError::new(ExpressionNestedTooDeeply, self.get_pos()).into()); - } - - let mut content = String::new(); - let mut values = vec![]; - - while let Some(&ch) = self.peek() { - match ch { - '{' => { - self.next_char(); - if nested == 0 { - match self.peek() { - Some('{') => { - self.next_char(); - content.push('{'); - continue; - } - None => { - return Err(FStringError::new(UnclosedLbrace, self.get_pos()).into()) - } - _ => {} - } - } - if !content.is_empty() { - values.push(self.expr(ExprKind::Constant { - value: content.drain(..).collect::().into(), - kind: None, - })); - } - - let parsed_values = self.parse_formatted_value(nested)?; - values.extend(parsed_values); - } - '}' => { - if nested > 0 { - break; - } - self.next_char(); - if let Some('}') = self.peek() { - self.next_char(); - content.push('}'); - } else { - return Err(FStringError::new(SingleRbrace, self.get_pos()).into()); - } - } - '\\' if !self.kind.is_raw() => { - self.next_char(); - content.push_str(&self.parse_escaped_char()?); - } - _ => { - content.push(ch); - self.next_char(); - } - } - } - - if !content.is_empty() { - values.push(self.expr(ExprKind::Constant { - value: content.into(), - kind: None, - })) - } - - Ok(values) - } - - pub fn parse_bytes(&mut self) -> Result { - let mut content = String::new(); - while let Some(ch) = self.next_char() { - match ch { - '\\' if !self.kind.is_raw() => { - content.push_str(&self.parse_escaped_char()?); - } - ch => { - if !ch.is_ascii() { - return Err(LexicalError::new( - LexicalErrorType::OtherError( - "bytes can only contain ASCII literal characters".to_string(), - ), - self.get_pos(), - )); - } - content.push(ch); - } - } - } - - Ok(self.expr(ExprKind::Constant { - value: Constant::Bytes(content.chars().map(|c| c as u8).collect()), - kind: None, - })) - } - - pub fn parse_string(&mut self) -> Result { - let mut content = String::new(); - while let Some(ch) = self.next_char() { - match ch { - '\\' if !self.kind.is_raw() => { - content.push_str(&self.parse_escaped_char()?); - } - ch => content.push(ch), - } - } - Ok(self.expr(ExprKind::Constant { - value: Constant::Str(content), - kind: self.kind.is_unicode().then(|| "u".to_string()), - })) - } - - pub fn parse(&mut self) -> Result, LexicalError> { - if self.kind.is_fstring() { - self.parse_fstring(0) - } else if self.kind.is_bytes() { - self.parse_bytes().map(|expr| vec![expr]) - } else { - self.parse_string().map(|expr| vec![expr]) - } - } -} - -fn parse_fstring_expr(source: &str, location: Location) -> Result { - let fstring_body = format!("({source})"); - parse_expression_located(&fstring_body, "", location.with_col_offset(-1)) -} - -pub fn parse_string( - source: &str, - kind: StringKind, - triple_quoted: bool, - start: Location, - end: Location, -) -> Result, LexicalError> { - StringParser::new(source, kind, triple_quoted, start, end).parse() -} - -#[cfg(test)] -mod tests { - use super::*; - - fn parse_fstring(source: &str) -> Result, LexicalError> { - StringParser::new( - source, - StringKind::FString, - false, - Location::default(), - Location::default().with_col_offset(source.len() + 3), // 3 for prefix and quotes - ) - .parse() - } - - #[test] - fn test_parse_fstring() { - let source = "{a}{ b }{{foo}}"; - let parse_ast = parse_fstring(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_nested_spec() { - let source = "{foo:{spec}}"; - let parse_ast = parse_fstring(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_not_nested_spec() { - let source = "{foo:spec}"; - let parse_ast = parse_fstring(source).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_empty_fstring() { - insta::assert_debug_snapshot!(parse_fstring("").unwrap()); - } - - #[test] - fn test_fstring_parse_selfdocumenting_base() { - let src = "{user=}"; - let parse_ast = parse_fstring(src).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_fstring_parse_selfdocumenting_base_more() { - let src = "mix {user=} with text and {second=}"; - let parse_ast = parse_fstring(src).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_fstring_parse_selfdocumenting_format() { - let src = "{user=:>10}"; - let parse_ast = parse_fstring(src).unwrap(); - - insta::assert_debug_snapshot!(parse_ast); - } - - fn parse_fstring_error(source: &str) -> FStringErrorType { - parse_fstring(source) - .map_err(|e| match e.error { - LexicalErrorType::FStringError(e) => e, - e => unreachable!("Expected FStringError: {:?}", e), - }) - .err() - .expect("Expected error") - } - - #[test] - fn test_parse_invalid_fstring() { - assert_eq!(parse_fstring_error("{5!a"), UnclosedLbrace); - assert_eq!(parse_fstring_error("{5!a1}"), UnclosedLbrace); - assert_eq!(parse_fstring_error("{5!"), UnclosedLbrace); - assert_eq!(parse_fstring_error("abc{!a 'cat'}"), EmptyExpression); - assert_eq!(parse_fstring_error("{!a"), EmptyExpression); - assert_eq!(parse_fstring_error("{ !a}"), EmptyExpression); - - assert_eq!(parse_fstring_error("{5!}"), InvalidConversionFlag); - assert_eq!(parse_fstring_error("{5!x}"), InvalidConversionFlag); - - assert_eq!( - parse_fstring_error("{a:{a:{b}}}"), - ExpressionNestedTooDeeply - ); - - assert_eq!(parse_fstring_error("{a:b}}"), SingleRbrace); - assert_eq!(parse_fstring_error("}"), SingleRbrace); - assert_eq!(parse_fstring_error("{a:{b}"), UnclosedLbrace); - assert_eq!(parse_fstring_error("{"), UnclosedLbrace); - - assert_eq!(parse_fstring_error("{}"), EmptyExpression); - - // TODO: check for InvalidExpression enum? - assert!(parse_fstring("{class}").is_err()); - } - - #[test] - fn test_parse_fstring_not_equals() { - let source = "{1 != 2}"; - let parse_ast = parse_fstring(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_equals() { - let source = "{42 == 42}"; - let parse_ast = parse_fstring(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_selfdoc_prec_space() { - let source = "{x =}"; - let parse_ast = parse_fstring(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_selfdoc_trailing_space() { - let source = "{x= }"; - let parse_ast = parse_fstring(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } - - #[test] - fn test_parse_fstring_yield_expr() { - let source = "{yield}"; - let parse_ast = parse_fstring(source).unwrap(); - insta::assert_debug_snapshot!(parse_ast); - } -}