From faec9372f92b430d3aa542f6530e5198cb0aa54c Mon Sep 17 00:00:00 2001 From: harupy Date: Fri, 30 Dec 2022 21:39:29 +0900 Subject: [PATCH] Fix FormattedValue location --- parser/python.lalrpop | 14 +- parser/src/error.rs | 9 + parser/src/fstring.rs | 1 + parser/src/lexer.rs | 312 ++-------- parser/src/lib.rs | 2 +- parser/src/parser.rs | 23 +- ...er__string__tests__double_quoted_byte.snap | 297 +++++++++ ...g__tests__escape_char_in_byte_literal.snap | 51 ++ ...n_parser__string__tests__escape_octet.snap | 46 ++ ...ing__tests__fstring_escaped_character.snap | 91 +++ ...tring__tests__fstring_escaped_newline.snap | 91 +++ ...ing__tests__fstring_line_continuation.snap | 91 +++ ...ing__tests__fstring_unescaped_newline.snap | 91 +++ ...tring__tests__parse_f_string_concat_3.snap | 4 +- ...er__string__tests__raw_byte_literal_1.snap | 45 ++ ...er__string__tests__raw_byte_literal_2.snap | 43 ++ ...on_parser__string__tests__raw_fstring.snap | 72 +++ ...er__string__tests__single_quoted_byte.snap | 297 +++++++++ ...ing__tests__triple_quoted_raw_fstring.snap | 72 +++ parser/src/string.rs | 229 +++++-- parser/src/string_parser.rs | 562 ++++++++++++++++++ parser/src/token.rs | 107 ++-- 22 files changed, 2195 insertions(+), 355 deletions(-) create mode 100644 parser/src/snapshots/rustpython_parser__string__tests__double_quoted_byte.snap create mode 100644 parser/src/snapshots/rustpython_parser__string__tests__escape_char_in_byte_literal.snap create mode 100644 parser/src/snapshots/rustpython_parser__string__tests__escape_octet.snap create mode 100644 parser/src/snapshots/rustpython_parser__string__tests__fstring_escaped_character.snap create mode 100644 parser/src/snapshots/rustpython_parser__string__tests__fstring_escaped_newline.snap create mode 100644 parser/src/snapshots/rustpython_parser__string__tests__fstring_line_continuation.snap create mode 100644 parser/src/snapshots/rustpython_parser__string__tests__fstring_unescaped_newline.snap create mode 100644 parser/src/snapshots/rustpython_parser__string__tests__raw_byte_literal_1.snap create mode 100644 parser/src/snapshots/rustpython_parser__string__tests__raw_byte_literal_2.snap create mode 100644 parser/src/snapshots/rustpython_parser__string__tests__raw_fstring.snap create mode 100644 parser/src/snapshots/rustpython_parser__string__tests__single_quoted_byte.snap create mode 100644 parser/src/snapshots/rustpython_parser__string__tests__triple_quoted_raw_fstring.snap create mode 100644 parser/src/string_parser.rs diff --git a/parser/python.lalrpop b/parser/python.lalrpop index a73d835..90694a4 100644 --- a/parser/python.lalrpop +++ b/parser/python.lalrpop @@ -1339,18 +1339,11 @@ OneOrMore: Vec = { }; Constant: ast::Constant = { - => ast::Constant::Bytes(b.into_iter().flatten().collect()), => ast::Constant::Int(value), => ast::Constant::Float(value), => ast::Constant::Complex { real: s.0, imag: s.1 }, }; -Bytes: Vec = { - => { - s.into_iter().flatten().collect::>() - }, -}; - Identifier: String = => s; // Hook external lexer: @@ -1448,8 +1441,11 @@ extern { int => lexer::Tok::Int { value: }, float => lexer::Tok::Float { value: }, complex => lexer::Tok::Complex { real: , imag: }, - string => lexer::Tok::String { value: , kind: }, - bytes => lexer::Tok::Bytes { value: > }, + string => lexer::Tok::String { + value: , + kind: , + triple_quoted: + }, name => lexer::Tok::Name { name: }, "\n" => lexer::Tok::Newline, ";" => lexer::Tok::Semi, diff --git a/parser/src/error.rs b/parser/src/error.rs index 89f5366..92cb0d1 100644 --- a/parser/src/error.rs +++ 
b/parser/src/error.rs @@ -90,6 +90,15 @@ pub enum FStringErrorType { UnterminatedString, } +impl FStringErrorType { + pub fn to_lexical_error(self, location: Location) -> LexicalError { + LexicalError { + error: LexicalErrorType::FStringError(self), + location, + } + } +} + impl fmt::Display for FStringErrorType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { diff --git a/parser/src/fstring.rs b/parser/src/fstring.rs index 12b6bf8..27a615a 100644 --- a/parser/src/fstring.rs +++ b/parser/src/fstring.rs @@ -1,3 +1,4 @@ +// We no longer need this file use self::FStringErrorType::*; use crate::{ ast::{Constant, ConversionFlag, Expr, ExprKind, Location}, diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs index 65fb374..db97561 100644 --- a/parser/src/lexer.rs +++ b/parser/src/lexer.rs @@ -2,8 +2,7 @@ //! //! This means source code is translated into separate tokens. -use super::token::StringKind; -pub use super::token::Tok; +pub use super::token::{StringKind, Tok}; use crate::ast::Location; use crate::error::{LexicalError, LexicalErrorType}; use num_bigint::BigInt; @@ -217,9 +216,6 @@ where } } -/// unicode_name2 does not expose `MAX_NAME_LENGTH`, so we replicate that constant here, fix #3798 -const MAX_UNICODE_NAME: usize = 88; - impl Lexer where T: Iterator, @@ -274,8 +270,26 @@ where // Check if we have a string: if matches!(self.window[0], Some('"' | '\'')) { + let kind = if saw_r { + if saw_b { + StringKind::RawBytes + } else if saw_f { + StringKind::RawFString + } else { + StringKind::RawString + } + } else if saw_b { + StringKind::Bytes + } else if saw_u { + StringKind::Unicode + } else if saw_f { + StringKind::FString + } else { + StringKind::String + }; + return self - .lex_string(saw_b, saw_r, saw_u, saw_f) + .lex_string(kind) .map(|(_, tok, end_pos)| (start_pos, tok, end_pos)); } } @@ -479,87 +493,7 @@ where } } - fn unicode_literal(&mut self, literal_number: usize) -> Result { - let mut p: u32 = 0u32; - let unicode_error = LexicalError { - error: LexicalErrorType::UnicodeError, - location: self.get_pos(), - }; - for i in 1..=literal_number { - match self.next_char() { - Some(c) => match c.to_digit(16) { - Some(d) => p += d << ((literal_number - i) * 4), - None => return Err(unicode_error), - }, - None => return Err(unicode_error), - } - } - match p { - 0xD800..=0xDFFF => Ok(std::char::REPLACEMENT_CHARACTER), - _ => std::char::from_u32(p).ok_or(unicode_error), - } - } - - fn parse_octet(&mut self, first: char) -> char { - let mut octet_content = String::new(); - octet_content.push(first); - while octet_content.len() < 3 { - if let Some('0'..='7') = self.window[0] { - octet_content.push(self.next_char().unwrap()) - } else { - break; - } - } - let value = u32::from_str_radix(&octet_content, 8).unwrap(); - char::from_u32(value).unwrap() - } - - fn parse_unicode_name(&mut self) -> Result { - let start_pos = self.get_pos(); - match self.next_char() { - Some('{') => {} - _ => { - return Err(LexicalError { - error: LexicalErrorType::StringError, - location: start_pos, - }) - } - } - let start_pos = self.get_pos(); - let mut name = String::new(); - loop { - match self.next_char() { - Some('}') => break, - Some(c) => name.push(c), - None => { - return Err(LexicalError { - error: LexicalErrorType::StringError, - location: self.get_pos(), - }) - } - } - } - - if name.len() > MAX_UNICODE_NAME { - return Err(LexicalError { - error: LexicalErrorType::UnicodeError, - location: self.get_pos(), - }); - } - - unicode_names2::character(&name).ok_or(LexicalError { - error: 
LexicalErrorType::UnicodeError, - location: start_pos, - }) - } - - fn lex_string( - &mut self, - is_bytes: bool, - is_raw: bool, - is_unicode: bool, - is_fstring: bool, - ) -> LexResult { + fn lex_string(&mut self, kind: StringKind) -> LexResult { let start_pos = self.get_pos(); let quote_char = self.next_char().unwrap(); let mut string_content = String::new(); @@ -577,62 +511,24 @@ where loop { match self.next_char() { - Some('\\') => { - if self.window[0] == Some(quote_char) && !is_raw { - string_content.push(quote_char); - self.next_char(); - } else if is_raw { - string_content.push('\\'); - if let Some(c) = self.next_char() { - string_content.push(c) - } else { - return Err(LexicalError { - error: LexicalErrorType::StringError, - location: self.get_pos(), - }); - } - } else { - match self.next_char() { - Some('\\') => { - string_content.push('\\'); - } - Some('\'') => string_content.push('\''), - Some('\"') => string_content.push('\"'), - Some('\n') => { - // Ignore Unix EOL character - } - Some('a') => string_content.push('\x07'), - Some('b') => string_content.push('\x08'), - Some('f') => string_content.push('\x0c'), - Some('n') => { - string_content.push('\n'); - } - Some('r') => string_content.push('\r'), - Some('t') => { - string_content.push('\t'); - } - Some('v') => string_content.push('\x0b'), - Some(o @ '0'..='7') => string_content.push(self.parse_octet(o)), - Some('x') => string_content.push(self.unicode_literal(2)?), - Some('u') if !is_bytes => string_content.push(self.unicode_literal(4)?), - Some('U') if !is_bytes => string_content.push(self.unicode_literal(8)?), - Some('N') if !is_bytes => { - string_content.push(self.parse_unicode_name()?) - } - Some(c) => { - string_content.push('\\'); - string_content.push(c); - } - None => { - return Err(LexicalError { - error: LexicalErrorType::StringError, - location: self.get_pos(), - }); - } + Some(c) => { + if c == '\\' { + if let Some(next_c) = self.next_char() { + string_content.push('\\'); + string_content.push(next_c); + continue; } } - } - Some(c) => { + + if c == '\n' && !triple_quoted { + return Err(LexicalError { + error: LexicalErrorType::OtherError( + "EOL while scanning string literal".to_owned(), + ), + location: self.get_pos(), + }); + } + if c == quote_char { if triple_quoted { // Look ahead at the next two characters; if we have two more @@ -645,19 +541,11 @@ where self.next_char(); break; } - string_content.push(c); } else { break; } - } else { - if (c == '\n' && !triple_quoted) || (is_bytes && !c.is_ascii()) { - return Err(LexicalError { - error: LexicalErrorType::Eof, - location: self.get_pos(), - }); - } - string_content.push(c); } + string_content.push(c); } None => { return Err(LexicalError { @@ -672,25 +560,11 @@ where } } let end_pos = self.get_pos(); - - let tok = if is_bytes { - Tok::Bytes { - value: string_content.chars().map(|c| c as u8).collect(), - } - } else { - let kind = if is_fstring { - StringKind::F - } else if is_unicode { - StringKind::U - } else { - StringKind::Normal - }; - Tok::String { - value: string_content, - kind, - } + let tok = Tok::String { + value: string_content, + kind, + triple_quoted, }; - Ok((start_pos, tok, end_pos)) } @@ -907,7 +781,7 @@ where self.emit(comment); } '"' | '\'' => { - let string = self.lex_string(false, false, false, false)?; + let string = self.lex_string(StringKind::String)?; self.emit(string); } '=' => { @@ -1367,15 +1241,17 @@ mod tests { fn stok(s: &str) -> Tok { Tok::String { value: s.to_owned(), - kind: StringKind::Normal, + kind: StringKind::String, + 
triple_quoted: false, } } - #[test] - fn test_raw_string() { - let source = "r\"\\\\\" \"\\\\\""; - let tokens = lex_source(source); - assert_eq!(tokens, vec![stok("\\\\"), stok("\\"), Tok::Newline,]); + fn raw_stok(s: &str) -> Tok { + Tok::String { + value: s.to_owned(), + kind: StringKind::RawString, + triple_quoted: false, + } } #[test] @@ -1677,13 +1553,13 @@ mod tests { vec![ stok("double"), stok("single"), - stok("can't"), - stok("\\\""), - stok("\t\r\n"), - stok("\\g"), - stok("raw\\'"), - stok("Đ"), - stok("\u{80}\u{0}a"), + stok(r"can\'t"), + stok(r#"\\\""#), + stok(r"\t\r\n"), + stok(r"\g"), + raw_stok(r"raw\'"), + stok(r"\420"), + stok(r"\200\0a"), Tok::Newline, ] ); @@ -1699,7 +1575,7 @@ mod tests { assert_eq!( tokens, vec![ - stok("abcdef"), + stok("abc\\\ndef"), Tok::Newline, ] ) @@ -1714,78 +1590,10 @@ mod tests { test_string_continuation_unix_eol: UNIX_EOL, } - #[test] - fn test_single_quoted_byte() { - // single quote - let source = r##"b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff'"##; - let tokens = lex_source(source); - let res = (0..=255).collect::>(); - assert_eq!(tokens, vec![Tok::Bytes { value: res }, Tok::Newline]); - } - - #[test] - fn test_double_quoted_byte() { - // double quote - let source = r##"b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff""##; - let tokens = lex_source(source); - let res = (0..=255).collect::>(); - assert_eq!(tokens, vec![Tok::Bytes { value: res }, Tok::Newline]); - } - - #[test] - fn test_escape_char_in_byte_literal() { - // backslash does not escape - let source = r##"b"omkmok\Xaa""##; - let tokens = lex_source(source); - let res = vec![111, 109, 107, 109, 111, 107, 92, 88, 97, 97]; - assert_eq!(tokens, vec![Tok::Bytes { value: res }, Tok::Newline]); - } - - #[test] - fn test_raw_byte_literal() { - let source = r"rb'\x1z'"; - let tokens = lex_source(source); - assert_eq!( - tokens, - vec![ - Tok::Bytes { - value: b"\\x1z".to_vec() - }, - Tok::Newline - ] - ); - let source = r"rb'\\'"; - let tokens = lex_source(source); - assert_eq!( - tokens, - vec![ - Tok::Bytes { - value: b"\\\\".to_vec() - }, - Tok::Newline - ] - ) - } - - #[test] - fn test_escape_octet() { - let source = 
r##"b'\43a\4\1234'"##; - let tokens = lex_source(source); - assert_eq!( - tokens, - vec![ - Tok::Bytes { - value: b"#a\x04S4".to_vec() - }, - Tok::Newline - ] - ) - } - #[test] fn test_escape_unicode_name() { let source = r#""\N{EN SPACE}""#; let tokens = lex_source(source); - assert_eq!(tokens, vec![stok("\u{2002}"), Tok::Newline]) + assert_eq!(tokens, vec![stok(r"\N{EN SPACE}"), Tok::Newline]) } } diff --git a/parser/src/lib.rs b/parser/src/lib.rs index 66e5653..d266d2a 100644 --- a/parser/src/lib.rs +++ b/parser/src/lib.rs @@ -23,11 +23,11 @@ extern crate log; pub use rustpython_ast as ast; pub mod error; -mod fstring; mod function; pub mod lexer; pub mod mode; pub mod parser; +mod string_parser; #[rustfmt::skip] mod python; mod context; diff --git a/parser/src/parser.rs b/parser/src/parser.rs index 705fb99..6dcbacc 100644 --- a/parser/src/parser.rs +++ b/parser/src/parser.rs @@ -8,6 +8,7 @@ use crate::lexer::{LexResult, Tok}; pub use crate::mode::Mode; use crate::{ast, error::ParseError, lexer, python}; +use ast::Location; use itertools::Itertools; use std::iter; @@ -65,7 +66,15 @@ pub fn parse_program(source: &str, source_path: &str) -> Result Result { - parse(source, Mode::Expression, path).map(|top| match top { + parse_expression_located(source, path, Location::new(1, 0)) +} + +pub fn parse_expression_located( + source: &str, + path: &str, + location: Location, +) -> Result { + parse_located(source, Mode::Expression, path, location).map(|top| match top { ast::Mod::Expression { body } => *body, _ => unreachable!(), }) @@ -73,7 +82,17 @@ pub fn parse_expression(source: &str, path: &str) -> Result Result { - let lxr = lexer::make_tokenizer(source); + parse_located(source, mode, source_path, Location::new(1, 0)) +} + +// Parse a given source code from a given location +pub fn parse_located( + source: &str, + mode: Mode, + source_path: &str, + location: Location, +) -> Result { + let lxr = lexer::make_tokenizer_located(source, location); let marker_token = (Default::default(), mode.to_marker(), Default::default()); let tokenizer = iter::once(Ok(marker_token)) .chain(lxr) diff --git a/parser/src/snapshots/rustpython_parser__string__tests__double_quoted_byte.snap b/parser/src/snapshots/rustpython_parser__string__tests__double_quoted_byte.snap new file mode 100644 index 0000000..0d8c8c9 --- /dev/null +++ b/parser/src/snapshots/rustpython_parser__string__tests__double_quoted_byte.snap @@ -0,0 +1,297 @@ +--- +source: compiler/parser/src/string.rs +expression: parse_ast +--- +[ + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 738, + }, + ), + custom: (), + node: Expr { + value: Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 738, + }, + ), + custom: (), + node: Constant { + value: Bytes( + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, 
+ 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + ], + ), + kind: None, + }, + }, + }, + }, +] diff --git a/parser/src/snapshots/rustpython_parser__string__tests__escape_char_in_byte_literal.snap b/parser/src/snapshots/rustpython_parser__string__tests__escape_char_in_byte_literal.snap new file mode 100644 index 0000000..98fc3c7 --- /dev/null +++ b/parser/src/snapshots/rustpython_parser__string__tests__escape_char_in_byte_literal.snap @@ -0,0 +1,51 @@ +--- +source: compiler/parser/src/string.rs +expression: parse_ast +--- +[ + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 13, + }, + ), + custom: (), + node: Expr { + value: Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 13, + }, + ), + custom: (), + node: Constant { + value: Bytes( + [ + 111, + 109, + 107, + 109, + 111, + 107, + 92, + 88, + 97, + 97, + ], + ), + kind: None, + }, + }, + }, + }, +] diff --git a/parser/src/snapshots/rustpython_parser__string__tests__escape_octet.snap b/parser/src/snapshots/rustpython_parser__string__tests__escape_octet.snap new file mode 100644 index 0000000..677e3f9 --- /dev/null +++ b/parser/src/snapshots/rustpython_parser__string__tests__escape_octet.snap @@ -0,0 +1,46 @@ +--- +source: compiler/parser/src/string.rs +expression: parse_ast +--- +[ + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 14, + }, + ), + custom: (), + node: Expr { + value: Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 14, + }, + ), + custom: (), + node: Constant { + value: Bytes( + [ + 35, + 97, + 4, + 83, + 52, + ], + ), + kind: None, + }, + }, + }, + }, +] diff --git a/parser/src/snapshots/rustpython_parser__string__tests__fstring_escaped_character.snap b/parser/src/snapshots/rustpython_parser__string__tests__fstring_escaped_character.snap new file mode 100644 index 0000000..7ab1247 --- /dev/null +++ b/parser/src/snapshots/rustpython_parser__string__tests__fstring_escaped_character.snap @@ -0,0 +1,91 @@ +--- +source: compiler/parser/src/string.rs +expression: parse_ast +--- +[ + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 8, + }, + ), + custom: (), + node: Expr { + value: Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 8, + }, + ), + custom: (), + node: JoinedStr { + values: [ + Located { + 
location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 8, + }, + ), + custom: (), + node: Constant { + value: Str( + "\\", + ), + kind: None, + }, + }, + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 8, + }, + ), + custom: (), + node: FormattedValue { + value: Located { + location: Location { + row: 1, + column: 5, + }, + end_location: Some( + Location { + row: 1, + column: 6, + }, + ), + custom: (), + node: Name { + id: "x", + ctx: Load, + }, + }, + conversion: 0, + format_spec: None, + }, + }, + ], + }, + }, + }, + }, +] diff --git a/parser/src/snapshots/rustpython_parser__string__tests__fstring_escaped_newline.snap b/parser/src/snapshots/rustpython_parser__string__tests__fstring_escaped_newline.snap new file mode 100644 index 0000000..8359845 --- /dev/null +++ b/parser/src/snapshots/rustpython_parser__string__tests__fstring_escaped_newline.snap @@ -0,0 +1,91 @@ +--- +source: compiler/parser/src/string.rs +expression: parse_ast +--- +[ + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 8, + }, + ), + custom: (), + node: Expr { + value: Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 8, + }, + ), + custom: (), + node: JoinedStr { + values: [ + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 8, + }, + ), + custom: (), + node: Constant { + value: Str( + "\n", + ), + kind: None, + }, + }, + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 8, + }, + ), + custom: (), + node: FormattedValue { + value: Located { + location: Location { + row: 1, + column: 5, + }, + end_location: Some( + Location { + row: 1, + column: 6, + }, + ), + custom: (), + node: Name { + id: "x", + ctx: Load, + }, + }, + conversion: 0, + format_spec: None, + }, + }, + ], + }, + }, + }, + }, +] diff --git a/parser/src/snapshots/rustpython_parser__string__tests__fstring_line_continuation.snap b/parser/src/snapshots/rustpython_parser__string__tests__fstring_line_continuation.snap new file mode 100644 index 0000000..2f6167d --- /dev/null +++ b/parser/src/snapshots/rustpython_parser__string__tests__fstring_line_continuation.snap @@ -0,0 +1,91 @@ +--- +source: compiler/parser/src/string.rs +expression: parse_ast +--- +[ + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 2, + column: 4, + }, + ), + custom: (), + node: Expr { + value: Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 2, + column: 4, + }, + ), + custom: (), + node: JoinedStr { + values: [ + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 2, + column: 4, + }, + ), + custom: (), + node: Constant { + value: Str( + "\\\n", + ), + kind: None, + }, + }, + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 2, + column: 4, + }, + ), + custom: (), + node: FormattedValue { + value: Located { + location: Location { + row: 2, + column: 1, + }, + end_location: Some( + Location { + row: 2, + column: 2, + }, + ), + custom: (), + node: Name { + id: "x", + ctx: Load, + }, + }, + conversion: 0, + format_spec: None, + }, + }, + ], + }, + }, + }, + }, +] diff --git 
a/parser/src/snapshots/rustpython_parser__string__tests__fstring_unescaped_newline.snap b/parser/src/snapshots/rustpython_parser__string__tests__fstring_unescaped_newline.snap new file mode 100644 index 0000000..b44c329 --- /dev/null +++ b/parser/src/snapshots/rustpython_parser__string__tests__fstring_unescaped_newline.snap @@ -0,0 +1,91 @@ +--- +source: compiler/parser/src/string.rs +expression: parse_ast +--- +[ + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 2, + column: 6, + }, + ), + custom: (), + node: Expr { + value: Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 2, + column: 6, + }, + ), + custom: (), + node: JoinedStr { + values: [ + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 2, + column: 6, + }, + ), + custom: (), + node: Constant { + value: Str( + "\n", + ), + kind: None, + }, + }, + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 2, + column: 6, + }, + ), + custom: (), + node: FormattedValue { + value: Located { + location: Location { + row: 2, + column: 1, + }, + end_location: Some( + Location { + row: 2, + column: 2, + }, + ), + custom: (), + node: Name { + id: "x", + ctx: Load, + }, + }, + conversion: 0, + format_spec: None, + }, + }, + ], + }, + }, + }, + }, +] diff --git a/parser/src/snapshots/rustpython_parser__string__tests__parse_f_string_concat_3.snap b/parser/src/snapshots/rustpython_parser__string__tests__parse_f_string_concat_3.snap index 87b9212..0fdc8e8 100644 --- a/parser/src/snapshots/rustpython_parser__string__tests__parse_f_string_concat_3.snap +++ b/parser/src/snapshots/rustpython_parser__string__tests__parse_f_string_concat_3.snap @@ -65,12 +65,12 @@ expression: parse_ast value: Located { location: Location { row: 1, - column: 1, + column: 17, }, end_location: Some( Location { row: 1, - column: 4, + column: 20, }, ), custom: (), diff --git a/parser/src/snapshots/rustpython_parser__string__tests__raw_byte_literal_1.snap b/parser/src/snapshots/rustpython_parser__string__tests__raw_byte_literal_1.snap new file mode 100644 index 0000000..14daedd --- /dev/null +++ b/parser/src/snapshots/rustpython_parser__string__tests__raw_byte_literal_1.snap @@ -0,0 +1,45 @@ +--- +source: compiler/parser/src/string.rs +expression: parse_ast +--- +[ + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 8, + }, + ), + custom: (), + node: Expr { + value: Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 8, + }, + ), + custom: (), + node: Constant { + value: Bytes( + [ + 92, + 120, + 49, + 122, + ], + ), + kind: None, + }, + }, + }, + }, +] diff --git a/parser/src/snapshots/rustpython_parser__string__tests__raw_byte_literal_2.snap b/parser/src/snapshots/rustpython_parser__string__tests__raw_byte_literal_2.snap new file mode 100644 index 0000000..d34d8c8 --- /dev/null +++ b/parser/src/snapshots/rustpython_parser__string__tests__raw_byte_literal_2.snap @@ -0,0 +1,43 @@ +--- +source: compiler/parser/src/string.rs +expression: parse_ast +--- +[ + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 6, + }, + ), + custom: (), + node: Expr { + value: Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 6, + }, + 
), + custom: (), + node: Constant { + value: Bytes( + [ + 92, + 92, + ], + ), + kind: None, + }, + }, + }, + }, +] diff --git a/parser/src/snapshots/rustpython_parser__string__tests__raw_fstring.snap b/parser/src/snapshots/rustpython_parser__string__tests__raw_fstring.snap new file mode 100644 index 0000000..e53b861 --- /dev/null +++ b/parser/src/snapshots/rustpython_parser__string__tests__raw_fstring.snap @@ -0,0 +1,72 @@ +--- +source: compiler/parser/src/string.rs +expression: parse_ast +--- +[ + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 7, + }, + ), + custom: (), + node: Expr { + value: Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 7, + }, + ), + custom: (), + node: JoinedStr { + values: [ + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 7, + }, + ), + custom: (), + node: FormattedValue { + value: Located { + location: Location { + row: 1, + column: 4, + }, + end_location: Some( + Location { + row: 1, + column: 5, + }, + ), + custom: (), + node: Name { + id: "x", + ctx: Load, + }, + }, + conversion: 0, + format_spec: None, + }, + }, + ], + }, + }, + }, + }, +] diff --git a/parser/src/snapshots/rustpython_parser__string__tests__single_quoted_byte.snap b/parser/src/snapshots/rustpython_parser__string__tests__single_quoted_byte.snap new file mode 100644 index 0000000..0d8c8c9 --- /dev/null +++ b/parser/src/snapshots/rustpython_parser__string__tests__single_quoted_byte.snap @@ -0,0 +1,297 @@ +--- +source: compiler/parser/src/string.rs +expression: parse_ast +--- +[ + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 738, + }, + ), + custom: (), + node: Expr { + value: Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 738, + }, + ), + custom: (), + node: Constant { + value: Bytes( + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 
229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, + ], + ), + kind: None, + }, + }, + }, + }, +] diff --git a/parser/src/snapshots/rustpython_parser__string__tests__triple_quoted_raw_fstring.snap b/parser/src/snapshots/rustpython_parser__string__tests__triple_quoted_raw_fstring.snap new file mode 100644 index 0000000..19975d5 --- /dev/null +++ b/parser/src/snapshots/rustpython_parser__string__tests__triple_quoted_raw_fstring.snap @@ -0,0 +1,72 @@ +--- +source: compiler/parser/src/string.rs +expression: parse_ast +--- +[ + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 11, + }, + ), + custom: (), + node: Expr { + value: Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 11, + }, + ), + custom: (), + node: JoinedStr { + values: [ + Located { + location: Location { + row: 1, + column: 0, + }, + end_location: Some( + Location { + row: 1, + column: 11, + }, + ), + custom: (), + node: FormattedValue { + value: Located { + location: Location { + row: 1, + column: 6, + }, + end_location: Some( + Location { + row: 1, + column: 7, + }, + ), + custom: (), + node: Name { + id: "x", + ctx: Load, + }, + }, + conversion: 0, + format_spec: None, + }, + }, + ], + }, + }, + }, + }, +] diff --git a/parser/src/string.rs b/parser/src/string.rs index b80c629..817d0e8 100644 --- a/parser/src/string.rs +++ b/parser/src/string.rs @@ -1,35 +1,79 @@ use crate::{ ast::{Constant, Expr, ExprKind, Location}, error::{LexicalError, LexicalErrorType}, - fstring::parse_located_fstring, + string_parser::parse_string, token::StringKind, }; use itertools::Itertools; pub fn parse_strings( - values: Vec<(Location, (String, StringKind), Location)>, + values: Vec<(Location, (String, StringKind, bool), Location)>, ) -> Result { // Preserve the initial location and kind. let initial_start = values[0].0; let last_end = values.last().unwrap().2; - let initial_kind = (values[0].1 .1 == StringKind::U).then(|| "u".to_owned()); + let initial_kind = (values[0].1 .1 == StringKind::Unicode).then(|| "u".to_owned()); + let has_fstring = values.iter().any(|(_, (_, kind, ..), _)| kind.is_fstring()); + let num_bytes = values + .iter() + .filter(|(_, (_, kind, ..), _)| kind.is_bytes()) + .count(); + let has_bytes = num_bytes > 0; - // Optimization: fast-track the common case of a single string. - if matches!(&*values, [(_, (_, StringKind::Normal | StringKind::U), _)]) { - let value = values.into_iter().last().unwrap().1 .0; + if has_bytes && num_bytes < values.len() { + return Err(LexicalError { + error: LexicalErrorType::OtherError( + "cannot mix bytes and nonbytes literals".to_owned(), + ), + location: initial_start, + }); + } + + if has_bytes { + let mut content: Vec = vec![]; + for (start, (source, kind, triple_quoted), end) in values { + for value in parse_string(&source, kind, triple_quoted, start, end)? { + match value.node { + ExprKind::Constant { + value: Constant::Bytes(value), + .. + } => content.extend(value), + _ => unreachable!("Unexpected non-bytes expression."), + } + } + } return Ok(Expr::new( initial_start, last_end, ExprKind::Constant { - value: Constant::Str(value), - kind: initial_kind, + value: Constant::Bytes(content), + kind: None, }, )); } - // Determine whether the list of values contains any f-strings. 
(If not, we can return a - // single Constant at the end, rather than a JoinedStr.) - let mut has_fstring = false; + if !has_fstring { + let mut content: Vec = vec![]; + for (start, (source, kind, triple_quoted), end) in values { + for value in parse_string(&source, kind, triple_quoted, start, end)? { + match value.node { + ExprKind::Constant { + value: Constant::Str(value), + .. + } => content.push(value), + _ => unreachable!("Unexpected non-string expression."), + } + } + } + return Ok(Expr::new( + initial_start, + last_end, + ExprKind::Constant { + value: Constant::Str(content.join("")), + kind: initial_kind, + }, + )); + } // De-duplicate adjacent constants. let mut deduped: Vec = vec![]; @@ -46,34 +90,20 @@ pub fn parse_strings( ) }; - for (start, (string, string_kind), end) in values { - match string_kind { - StringKind::Normal | StringKind::U => current.push(string), - StringKind::F => { - has_fstring = true; - for value in - parse_located_fstring(&string, start, end).map_err(|e| LexicalError { - location: start, - error: LexicalErrorType::FStringError(e.error), - })? - { - match value.node { - ExprKind::FormattedValue { .. } => { - if !current.is_empty() { - deduped.push(take_current(&mut current)); - } - deduped.push(value) - } - ExprKind::Constant { value, .. } => { - if let Constant::Str(value) = value { - current.push(value); - } else { - unreachable!("Unexpected non-string constant."); - } - } - _ => unreachable!("Unexpected non-string expression."), + for (start, (source, kind, triple_quoted), end) in values { + for value in parse_string(&source, kind, triple_quoted, start, end)? { + match value.node { + ExprKind::FormattedValue { .. } => { + if !current.is_empty() { + deduped.push(take_current(&mut current)); } + deduped.push(value) } + ExprKind::Constant { + value: Constant::Str(value), + .. 
+ } => current.push(value), + _ => unreachable!("Unexpected non-string expression."), } } } @@ -101,64 +131,153 @@ mod tests { #[test] fn test_parse_string_concat() { - let source = String::from("'Hello ' 'world'"); - let parse_ast = parse_program(&source, "").unwrap(); + let source = "'Hello ' 'world'"; + let parse_ast = parse_program(source, "").unwrap(); insta::assert_debug_snapshot!(parse_ast); } #[test] fn test_parse_u_string_concat_1() { - let source = String::from("'Hello ' u'world'"); - let parse_ast = parse_program(&source, "").unwrap(); + let source = "'Hello ' u'world'"; + let parse_ast = parse_program(source, "").unwrap(); insta::assert_debug_snapshot!(parse_ast); } #[test] fn test_parse_u_string_concat_2() { - let source = String::from("u'Hello ' 'world'"); - let parse_ast = parse_program(&source, "").unwrap(); + let source = "u'Hello ' 'world'"; + let parse_ast = parse_program(source, "").unwrap(); insta::assert_debug_snapshot!(parse_ast); } #[test] fn test_parse_f_string_concat_1() { - let source = String::from("'Hello ' f'world'"); - let parse_ast = parse_program(&source, "").unwrap(); + let source = "'Hello ' f'world'"; + let parse_ast = parse_program(source, "").unwrap(); insta::assert_debug_snapshot!(parse_ast); } #[test] fn test_parse_f_string_concat_2() { - let source = String::from("'Hello ' f'world'"); - let parse_ast = parse_program(&source, "").unwrap(); + let source = "'Hello ' f'world'"; + let parse_ast = parse_program(source, "").unwrap(); insta::assert_debug_snapshot!(parse_ast); } #[test] fn test_parse_f_string_concat_3() { - let source = String::from("'Hello ' f'world{\"!\"}'"); - let parse_ast = parse_program(&source, "").unwrap(); + let source = "'Hello ' f'world{\"!\"}'"; + let parse_ast = parse_program(source, "").unwrap(); insta::assert_debug_snapshot!(parse_ast); } #[test] fn test_parse_u_f_string_concat_1() { - let source = String::from("u'Hello ' f'world'"); - let parse_ast = parse_program(&source, "").unwrap(); + let source = "u'Hello ' f'world'"; + let parse_ast = parse_program(source, "").unwrap(); insta::assert_debug_snapshot!(parse_ast); } #[test] fn test_parse_u_f_string_concat_2() { - let source = String::from("u'Hello ' f'world' '!'"); - let parse_ast = parse_program(&source, "").unwrap(); + let source = "u'Hello ' f'world' '!'"; + let parse_ast = parse_program(source, "").unwrap(); insta::assert_debug_snapshot!(parse_ast); } #[test] fn test_parse_string_triple_quotes_with_kind() { - let source = String::from("u'''Hello, world!'''"); - let parse_ast = parse_program(&source, "").unwrap(); + let source = "u'''Hello, world!'''"; + let parse_ast = parse_program(source, "").unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_single_quoted_byte() { + // single quote + let source = r##"b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff'"##; + let 
parse_ast = parse_program(source, "").unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_double_quoted_byte() { + // double quote + let source = r##"b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff""##; + let parse_ast = parse_program(source, "").unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_escape_char_in_byte_literal() { + // backslash does not escape + let source = r##"b"omkmok\Xaa""##; + let parse_ast = parse_program(source, "").unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_raw_byte_literal_1() { + let source = r"rb'\x1z'"; + let parse_ast = parse_program(source, "").unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_raw_byte_literal_2() { + let source = r"rb'\\'"; + let parse_ast = parse_program(source, "").unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_escape_octet() { + let source = r##"b'\43a\4\1234'"##; + let parse_ast = parse_program(source, "").unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_fstring_escaped_newline() { + let source = r#"f"\n{x}""#; + let parse_ast = parse_program(source, "").unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_fstring_unescaped_newline() { + let source = r#"f""" +{x}""""#; + let parse_ast = parse_program(source, "").unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_fstring_escaped_character() { + let source = r#"f"\\{x}""#; + let parse_ast = parse_program(source, "").unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_raw_fstring() { + let source = r#"rf"{x}""#; + let parse_ast = parse_program(source, "").unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_triple_quoted_raw_fstring() { + let source = r#"rf"""{x}""""#; + let parse_ast = parse_program(source, "").unwrap(); + insta::assert_debug_snapshot!(parse_ast); + } + + #[test] + fn test_fstring_line_continuation() { + let source = r#"rf"\ +{x}""#; + let parse_ast = parse_program(source, "").unwrap(); insta::assert_debug_snapshot!(parse_ast); } } diff --git a/parser/src/string_parser.rs b/parser/src/string_parser.rs new file mode 100644 index 0000000..610951a --- /dev/null +++ b/parser/src/string_parser.rs @@ -0,0 +1,562 @@ +use self::FStringErrorType::*; +use crate::{ + ast::{Constant, ConversionFlag, Expr, ExprKind, Location}, + error::{FStringErrorType, LexicalError, LexicalErrorType, ParseError}, + parser::parse_expression_located, + token::StringKind, +}; +use std::{iter, mem, str}; + +/// unicode_name2 does not expose `MAX_NAME_LENGTH`, so we replicate that constant here, fix #3798 +pub const MAX_UNICODE_NAME: usize = 88; + +pub struct StringParser<'a> { + chars: iter::Peekable>, + kind: StringKind, + str_start: 
Location, + str_end: Location, + location: Location, +} + +impl<'a> StringParser<'a> { + pub fn new( + source: &'a str, + kind: StringKind, + triple_quoted: bool, + str_start: Location, + str_end: Location, + ) -> Self { + let offset = kind.to_string().len() + if triple_quoted { 3 } else { 1 }; + Self { + chars: source.chars().peekable(), + kind, + str_start, + str_end, + location: Location::new(str_start.row(), str_start.column() + offset), + } + } + + fn next_char(&mut self) -> Option { + let Some(c) = self.chars.next() else { + return None + }; + if c == '\n' { + self.location.newline(); + } else { + self.location.go_right(); + } + Some(c) + } + + fn peek(&mut self) -> Option<&char> { + self.chars.peek() + } + + fn get_pos(&self) -> Location { + self.location + } + + #[inline] + fn expr(&self, node: ExprKind) -> Expr { + Expr::new(self.str_start, self.str_end, node) + } + + fn parse_unicode_literal(&mut self, literal_number: usize) -> Result { + let mut p: u32 = 0u32; + let unicode_error = LexicalError { + error: LexicalErrorType::UnicodeError, + location: self.get_pos(), + }; + for i in 1..=literal_number { + match self.next_char() { + Some(c) => match c.to_digit(16) { + Some(d) => p += d << ((literal_number - i) * 4), + None => return Err(unicode_error), + }, + None => return Err(unicode_error), + } + } + match p { + 0xD800..=0xDFFF => Ok(std::char::REPLACEMENT_CHARACTER), + _ => std::char::from_u32(p).ok_or(unicode_error), + } + } + + fn parse_octet(&mut self, first: char) -> char { + let mut octet_content = String::new(); + octet_content.push(first); + while octet_content.len() < 3 { + if let Some('0'..='7') = self.peek() { + octet_content.push(self.next_char().unwrap()) + } else { + break; + } + } + let value = u32::from_str_radix(&octet_content, 8).unwrap(); + char::from_u32(value).unwrap() + } + + fn parse_unicode_name(&mut self) -> Result { + let start_pos = self.get_pos(); + match self.next_char() { + Some('{') => {} + _ => { + return Err(LexicalError { + error: LexicalErrorType::StringError, + location: start_pos, + }) + } + } + let start_pos = self.get_pos(); + let mut name = String::new(); + loop { + match self.next_char() { + Some('}') => break, + Some(c) => name.push(c), + None => { + return Err(LexicalError { + error: LexicalErrorType::StringError, + location: self.get_pos(), + }) + } + } + } + + if name.len() > MAX_UNICODE_NAME { + return Err(LexicalError { + error: LexicalErrorType::UnicodeError, + location: self.get_pos(), + }); + } + + unicode_names2::character(&name).ok_or(LexicalError { + error: LexicalErrorType::UnicodeError, + location: start_pos, + }) + } + + fn parse_escaped_char(&mut self) -> Result { + match self.next_char() { + Some(c) => Ok(match c { + '\\' => '\\'.to_string(), + '\'' => '\''.to_string(), + '\"' => '"'.to_string(), + '\n' => "".to_string(), + 'a' => '\x07'.to_string(), + 'b' => '\x08'.to_string(), + 'f' => '\x0c'.to_string(), + 'n' => '\n'.to_string(), + 'r' => '\r'.to_string(), + 't' => '\t'.to_string(), + 'v' => '\x0b'.to_string(), + o @ '0'..='7' => self.parse_octet(o).to_string(), + 'x' => self.parse_unicode_literal(2)?.to_string(), + 'u' if !self.kind.is_bytes() => self.parse_unicode_literal(4)?.to_string(), + 'U' if !self.kind.is_bytes() => self.parse_unicode_literal(8)?.to_string(), + 'N' if !self.kind.is_bytes() => self.parse_unicode_name()?.to_string(), + c => { + if self.kind.is_bytes() && !c.is_ascii() { + return Err(LexicalError { + error: LexicalErrorType::OtherError( + "bytes can only contain ASCII literal 
characters".to_owned(), + ), + location: self.get_pos(), + }); + } + format!("\\{c}") + } + }), + None => Err(LexicalError { + error: LexicalErrorType::StringError, + location: self.get_pos(), + }), + } + } + + fn parse_formatted_value(&mut self, nested: u8) -> Result, LexicalError> { + let mut expression = String::new(); + let mut spec = None; + let mut delims = Vec::new(); + let mut conversion = ConversionFlag::None; + let mut self_documenting = false; + let mut trailing_seq = String::new(); + let location = self.get_pos(); + + while let Some(ch) = self.next_char() { + match ch { + // can be integrated better with the remaining code, but as a starting point ok + // in general I would do here a tokenizing of the fstrings to omit this peeking. + '!' if self.peek() == Some(&'=') => { + expression.push_str("!="); + self.next_char(); + } + + '=' if self.peek() == Some(&'=') => { + expression.push_str("=="); + self.next_char(); + } + + '>' if self.peek() == Some(&'=') => { + expression.push_str(">="); + self.next_char(); + } + + '<' if self.peek() == Some(&'=') => { + expression.push_str("<="); + self.next_char(); + } + + '!' if delims.is_empty() && self.peek() != Some(&'=') => { + if expression.trim().is_empty() { + return Err(EmptyExpression.to_lexical_error(self.get_pos())); + } + + conversion = match self.next_char() { + Some('s') => ConversionFlag::Str, + Some('a') => ConversionFlag::Ascii, + Some('r') => ConversionFlag::Repr, + Some(_) => { + return Err(if expression.trim().is_empty() { + EmptyExpression.to_lexical_error(self.get_pos()) + } else { + InvalidConversionFlag.to_lexical_error(self.get_pos()) + }); + } + None => { + return Err(if expression.trim().is_empty() { + EmptyExpression.to_lexical_error(self.get_pos()) + } else { + UnclosedLbrace.to_lexical_error(self.get_pos()) + }); + } + }; + + if let Some(&peek) = self.peek() { + if peek != '}' && peek != ':' { + return Err(if expression.trim().is_empty() { + EmptyExpression.to_lexical_error(self.get_pos()) + } else { + UnclosedLbrace.to_lexical_error(self.get_pos()) + }); + } + } else { + return Err(if expression.trim().is_empty() { + EmptyExpression.to_lexical_error(self.get_pos()) + } else { + UnclosedLbrace.to_lexical_error(self.get_pos()) + }); + } + } + + // match a python 3.8 self documenting expression + // format '{' PYTHON_EXPRESSION '=' FORMAT_SPECIFIER? 
'}' + '=' if self.peek() != Some(&'=') && delims.is_empty() => { + self_documenting = true; + } + + ':' if delims.is_empty() => { + let parsed_spec = self.parse_spec(nested)?; + + spec = Some(Box::new(self.expr(ExprKind::JoinedStr { + values: parsed_spec, + }))); + } + '(' | '{' | '[' => { + expression.push(ch); + delims.push(ch); + } + ')' => { + let last_delim = delims.pop(); + match last_delim { + Some('(') => { + expression.push(ch); + } + Some(c) => { + return Err( + MismatchedDelimiter(c, ')').to_lexical_error(self.get_pos()) + ); + } + None => { + return Err(Unmatched(')').to_lexical_error(self.get_pos())); + } + } + } + ']' => { + let last_delim = delims.pop(); + match last_delim { + Some('[') => { + expression.push(ch); + } + Some(c) => { + return Err( + MismatchedDelimiter(c, ']').to_lexical_error(self.get_pos()) + ); + } + None => { + return Err(Unmatched(']').to_lexical_error(self.get_pos())); + } + } + } + '}' if !delims.is_empty() => { + let last_delim = delims.pop(); + match last_delim { + Some('{') => { + expression.push(ch); + } + Some(c) => { + return Err(MismatchedDelimiter(c, '}').to_lexical_error(self.get_pos())) + } + None => {} + } + } + '}' => { + if expression.trim().is_empty() { + return Err(EmptyExpression.to_lexical_error(self.get_pos())); + } + + let ret = if !self_documenting { + vec![self.expr(ExprKind::FormattedValue { + value: Box::new(parse_fstring_expr(&expression, location).map_err( + |e| { + InvalidExpression(Box::new(e.error)) + .to_lexical_error(self.get_pos()) + }, + )?), + conversion: conversion as _, + format_spec: spec, + })] + } else { + vec![ + self.expr(ExprKind::Constant { + value: Constant::Str(expression.to_owned() + "="), + kind: None, + }), + self.expr(ExprKind::Constant { + value: trailing_seq.into(), + kind: None, + }), + self.expr(ExprKind::FormattedValue { + value: Box::new( + parse_fstring_expr(&expression, location).map_err(|e| { + InvalidExpression(Box::new(e.error)) + .to_lexical_error(self.get_pos()) + })?, + ), + conversion: (if conversion == ConversionFlag::None && spec.is_none() + { + ConversionFlag::Repr + } else { + conversion + }) as _, + format_spec: spec, + }), + ] + }; + return Ok(ret); + } + '"' | '\'' => { + expression.push(ch); + loop { + let Some(c) = self.next_char() else { + return Err(UnterminatedString.to_lexical_error(self.get_pos())); + }; + expression.push(c); + if c == ch { + break; + } + } + } + ' ' if self_documenting => { + trailing_seq.push(ch); + } + '\\' => return Err(ExpressionCannotInclude('\\').to_lexical_error(self.get_pos())), + _ => { + if self_documenting { + return Err(UnclosedLbrace.to_lexical_error(self.get_pos())); + } + + expression.push(ch); + } + } + } + Err(if expression.trim().is_empty() { + EmptyExpression.to_lexical_error(self.get_pos()) + } else { + UnclosedLbrace.to_lexical_error(self.get_pos()) + }) + } + + fn parse_spec(&mut self, nested: u8) -> Result, LexicalError> { + let mut spec_constructor = Vec::new(); + let mut constant_piece = String::new(); + while let Some(&next) = self.peek() { + match next { + '{' => { + if !constant_piece.is_empty() { + spec_constructor.push(self.expr(ExprKind::Constant { + value: constant_piece.to_owned().into(), + kind: None, + })); + constant_piece.clear(); + } + let parsed_expr = self.parse_fstring(nested + 1)?; + spec_constructor.extend(parsed_expr); + continue; + } + '}' => { + break; + } + _ => { + constant_piece.push(next); + } + } + self.next_char(); + } + if !constant_piece.is_empty() { + spec_constructor.push(self.expr(ExprKind::Constant { + 
value: constant_piece.to_owned().into(), + kind: None, + })); + constant_piece.clear(); + } + Ok(spec_constructor) + } + + fn parse_fstring(&mut self, nested: u8) -> Result, LexicalError> { + if nested >= 2 { + return Err(ExpressionNestedTooDeeply.to_lexical_error(self.get_pos())); + } + + let mut content = String::new(); + let mut values = vec![]; + + while let Some(&ch) = self.peek() { + match ch { + '{' => { + self.next_char(); + if nested == 0 { + match self.peek() { + Some('{') => { + self.next_char(); + content.push('{'); + continue; + } + None => return Err(UnclosedLbrace.to_lexical_error(self.get_pos())), + _ => {} + } + } + if !content.is_empty() { + values.push(self.expr(ExprKind::Constant { + value: mem::take(&mut content).into(), + kind: None, + })); + } + + let parsed_values = self.parse_formatted_value(nested)?; + values.extend(parsed_values); + } + '}' => { + if nested > 0 { + break; + } + self.next_char(); + if let Some('}') = self.peek() { + self.next_char(); + content.push('}'); + } else { + return Err(SingleRbrace.to_lexical_error(self.get_pos())); + } + } + '\\' if !self.kind.is_raw() => { + self.next_char(); + content.push_str(&self.parse_escaped_char()?); + } + _ => { + content.push(ch); + self.next_char(); + } + } + } + + if !content.is_empty() { + values.push(self.expr(ExprKind::Constant { + value: content.into(), + kind: None, + })) + } + + Ok(values) + } + + pub fn parse_bytes(&mut self) -> Result { + let mut content = String::new(); + while let Some(ch) = self.next_char() { + match ch { + '\\' if !self.kind.is_raw() => { + content.push_str(&self.parse_escaped_char()?); + } + ch => { + if !ch.is_ascii() { + return Err(LexicalError { + error: LexicalErrorType::OtherError( + "bytes can only contain ASCII literal characters".to_string(), + ), + location: self.get_pos(), + }); + } + content.push(ch); + } + } + } + + Ok(self.expr(ExprKind::Constant { + value: Constant::Bytes(content.chars().map(|c| c as u8).collect()), + kind: None, + })) + } + + pub fn parse_string(&mut self) -> Result { + let mut content = String::new(); + while let Some(ch) = self.next_char() { + match ch { + '\\' if !self.kind.is_raw() => { + content.push_str(&self.parse_escaped_char()?); + } + ch => content.push(ch), + } + } + Ok(self.expr(ExprKind::Constant { + value: Constant::Str(content), + kind: self.kind.is_unicode().then(|| "u".to_string()), + })) + } + + pub fn parse(&mut self) -> Result, LexicalError> { + if self.kind.is_fstring() { + self.parse_fstring(0) + } else if self.kind.is_bytes() { + self.parse_bytes().map(|expr| vec![expr]) + } else { + self.parse_string().map(|expr| vec![expr]) + } + } +} + +fn parse_fstring_expr(source: &str, location: Location) -> Result { + let fstring_body = format!("({source})"); + parse_expression_located( + &fstring_body, + "", + Location::new(location.row(), location.column() - 1), + ) +} + +pub fn parse_string( + source: &str, + kind: StringKind, + triple_quoted: bool, + start: Location, + end: Location, +) -> Result, LexicalError> { + StringParser::new(source, kind, triple_quoted, start, end).parse() +} diff --git a/parser/src/token.rs b/parser/src/token.rs index 14ffece..ce48410 100644 --- a/parser/src/token.rs +++ b/parser/src/token.rs @@ -1,17 +1,29 @@ //! Different token definitions. //! Loosely based on token.h from CPython source: use num_bigint::BigInt; -use std::fmt::{self, Write}; +use std::fmt; /// Python source code can be tokenized in a sequence of these tokens. 
#[derive(Clone, Debug, PartialEq)] pub enum Tok { - Name { name: String }, - Int { value: BigInt }, - Float { value: f64 }, - Complex { real: f64, imag: f64 }, - String { value: String, kind: StringKind }, - Bytes { value: Vec }, + Name { + name: String, + }, + Int { + value: BigInt, + }, + Float { + value: f64, + }, + Complex { + real: f64, + imag: f64, + }, + String { + value: String, + kind: StringKind, + triple_quoted: bool, + }, Newline, Indent, Dedent, @@ -107,13 +119,6 @@ pub enum Tok { Yield, } -#[derive(PartialEq, Eq, Debug, Clone)] -pub enum StringKind { - Normal, - F, - U, -} - impl fmt::Display for Tok { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use Tok::*; @@ -122,26 +127,13 @@ impl fmt::Display for Tok { Int { value } => write!(f, "'{value}'"), Float { value } => write!(f, "'{value}'"), Complex { real, imag } => write!(f, "{real}j{imag}"), - String { value, kind } => { - match kind { - StringKind::F => f.write_str("f")?, - StringKind::U => f.write_str("u")?, - StringKind::Normal => {} - } - write!(f, "{value:?}") - } - Bytes { value } => { - write!(f, "b\"")?; - for i in value { - match i { - 9 => f.write_str("\\t")?, - 10 => f.write_str("\\n")?, - 13 => f.write_str("\\r")?, - 32..=126 => f.write_char(*i as char)?, - _ => write!(f, "\\x{i:02x}")?, - } - } - f.write_str("\"") + String { + value, + kind, + triple_quoted, + } => { + let quotes = "\"".repeat(if *triple_quoted { 3 } else { 1 }); + write!(f, "{kind}{quotes}{value}{quotes}") } Newline => f.write_str("Newline"), Indent => f.write_str("Indent"), @@ -236,3 +228,50 @@ impl fmt::Display for Tok { } } } + +#[derive(PartialEq, Eq, Debug, Clone)] +pub enum StringKind { + String, + FString, + Bytes, + RawString, + RawFString, + RawBytes, + Unicode, +} + +impl fmt::Display for StringKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use StringKind::*; + match self { + String => f.write_str(""), + FString => f.write_str("f"), + Bytes => f.write_str("b"), + RawString => f.write_str("r"), + RawFString => f.write_str("rf"), + RawBytes => f.write_str("rb"), + Unicode => f.write_str("u"), + } + } +} + +impl StringKind { + pub fn is_raw(&self) -> bool { + use StringKind::{RawBytes, RawFString, RawString}; + matches!(self, RawString | RawFString | RawBytes) + } + + pub fn is_fstring(&self) -> bool { + use StringKind::{FString, RawFString}; + matches!(self, FString | RawFString) + } + + pub fn is_bytes(&self) -> bool { + use StringKind::{Bytes, RawBytes}; + matches!(self, Bytes | RawBytes) + } + + pub fn is_unicode(&self) -> bool { + matches!(self, StringKind::Unicode) + } +}
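
---
Usage sketch (not part of the patch): the diff above folds the old `Bytes`/`String` tokens into a single `Tok::String { value, kind, triple_quoted }` and moves all escape and f-string processing out of the lexer into the new `string_parser.rs`, so that `FormattedValue` nodes get locations relative to the original source. The snippet below is a minimal, hedged illustration of how those new shapes could be exercised; it assumes a dependency on the `rustpython_parser` crate as built from this patched source, that `make_tokenizer` is public (it is called from `parser.rs` in the diff), and it uses an arbitrary `"<test>"` source path. It is illustrative only, not part of the commit.

use rustpython_parser::lexer::{make_tokenizer, StringKind, Tok};
use rustpython_parser::parser::parse_program;

fn main() {
    // The lexer now emits one String token per literal, tagged with its
    // prefix kind and whether it was triple quoted; escapes and f-string
    // braces are left untouched in `value` for string_parser.rs to handle.
    let (_, tok, _) = make_tokenizer(r#"rf"{x}""#).next().unwrap().unwrap();
    match tok {
        Tok::String { value, kind, triple_quoted } => {
            assert_eq!(value, "{x}");
            assert_eq!(kind, StringKind::RawFString);
            assert!(!triple_quoted);
        }
        other => panic!("unexpected token: {other:?}"),
    }

    // Parsing a concatenation containing an f-string goes through
    // parse_strings -> parse_string, and the FormattedValue inside the
    // resulting JoinedStr now carries source-relative locations (compare the
    // updated parse_f_string_concat_3 snapshot, column 17 instead of 1).
    let ast = parse_program("'Hello ' f'world{\"!\"}'", "<test>").unwrap();
    println!("{ast:#?}");
}

Deferring escape handling to `StringParser` is what makes the location fix possible: the parser re-lexes the literal body with a `Location` seeded from the token's start (`parse_expression_located` / `make_tokenizer_located`), instead of positions computed against an already-unescaped buffer.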