Fix FormattedValue location

harupy 2022-12-30 21:39:29 +09:00
parent 4e00ba2c50
commit faec9372f9
22 changed files with 2195 additions and 355 deletions

View file

@@ -1339,18 +1339,11 @@ OneOrMore<T>: Vec<T> = {
};
Constant: ast::Constant = {
<b:bytes+> => ast::Constant::Bytes(b.into_iter().flatten().collect()),
<value:int> => ast::Constant::Int(value),
<value:float> => ast::Constant::Float(value),
<s:complex> => ast::Constant::Complex { real: s.0, imag: s.1 },
};
Bytes: Vec<u8> = {
<s:bytes+> => {
s.into_iter().flatten().collect::<Vec<u8>>()
},
};
Identifier: String = <s:name> => s;
// Hook external lexer:
@@ -1448,8 +1441,11 @@ extern {
int => lexer::Tok::Int { value: <BigInt> },
float => lexer::Tok::Float { value: <f64> },
complex => lexer::Tok::Complex { real: <f64>, imag: <f64> },
string => lexer::Tok::String { value: <String>, kind: <StringKind> },
bytes => lexer::Tok::Bytes { value: <Vec<u8>> },
string => lexer::Tok::String {
value: <String>,
kind: <StringKind>,
triple_quoted: <bool>
},
name => lexer::Tok::Name { name: <String> },
"\n" => lexer::Tok::Newline,
";" => lexer::Tok::Semi,

View file

@@ -90,6 +90,15 @@ pub enum FStringErrorType {
UnterminatedString,
}
impl FStringErrorType {
pub fn to_lexical_error(self, location: Location) -> LexicalError {
LexicalError {
error: LexicalErrorType::FStringError(self),
location,
}
}
}
impl fmt::Display for FStringErrorType {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
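
The helper added above is pure plumbing, but it replaces a four-line struct literal at every f-string error site in the new string_parser.rs. A self-contained sketch of the pattern (the types mirror error.rs; nothing is imported):

#[derive(Debug)]
struct Location {
    row: usize,
    column: usize,
}

#[derive(Debug)]
enum FStringErrorType {
    EmptyExpression,
}

#[derive(Debug)]
enum LexicalErrorType {
    FStringError(FStringErrorType),
}

#[derive(Debug)]
struct LexicalError {
    error: LexicalErrorType,
    location: Location,
}

impl FStringErrorType {
    fn to_lexical_error(self, location: Location) -> LexicalError {
        LexicalError {
            error: LexicalErrorType::FStringError(self),
            location,
        }
    }
}

fn main() {
    // What every error site can now write in one expression:
    let err = FStringErrorType::EmptyExpression.to_lexical_error(Location { row: 1, column: 3 });
    println!("{err:?}");
}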

View file

@@ -1,3 +1,4 @@
// We no longer need this file
use self::FStringErrorType::*;
use crate::{
ast::{Constant, ConversionFlag, Expr, ExprKind, Location},

View file

@@ -2,8 +2,7 @@
//!
//! This means source code is translated into separate tokens.
use super::token::StringKind;
pub use super::token::Tok;
pub use super::token::{StringKind, Tok};
use crate::ast::Location;
use crate::error::{LexicalError, LexicalErrorType};
use num_bigint::BigInt;
@@ -217,9 +216,6 @@ where
}
}
/// unicode_names2 does not expose `MAX_NAME_LENGTH`, so we replicate that constant here, fix #3798
const MAX_UNICODE_NAME: usize = 88;
impl<T> Lexer<T>
where
T: Iterator<Item = char>,
@@ -274,8 +270,26 @@ where
// Check if we have a string:
if matches!(self.window[0], Some('"' | '\'')) {
let kind = if saw_r {
if saw_b {
StringKind::RawBytes
} else if saw_f {
StringKind::RawFString
} else {
StringKind::RawString
}
} else if saw_b {
StringKind::Bytes
} else if saw_u {
StringKind::Unicode
} else if saw_f {
StringKind::FString
} else {
StringKind::String
};
return self
.lex_string(saw_b, saw_r, saw_u, saw_f)
.lex_string(kind)
.map(|(_, tok, end_pos)| (start_pos, tok, end_pos));
}
}
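
The decision tree above collapses the four saw_* booleans into a single StringKind as soon as the literal's prefix has been consumed, so everything downstream matches on one value. A self-contained sketch of the mapping (the enum mirrors token.rs; nothing is imported):

#[derive(Debug, PartialEq)]
enum StringKind {
    String,
    FString,
    Bytes,
    RawString,
    RawFString,
    RawBytes,
    Unicode,
}

fn kind_from_prefix(saw_b: bool, saw_r: bool, saw_u: bool, saw_f: bool) -> StringKind {
    if saw_r {
        if saw_b {
            StringKind::RawBytes
        } else if saw_f {
            StringKind::RawFString
        } else {
            StringKind::RawString
        }
    } else if saw_b {
        StringKind::Bytes
    } else if saw_u {
        StringKind::Unicode
    } else if saw_f {
        StringKind::FString
    } else {
        StringKind::String
    }
}

fn main() {
    assert_eq!(kind_from_prefix(true, true, false, false), StringKind::RawBytes); // rb"..."
    assert_eq!(kind_from_prefix(false, false, true, false), StringKind::Unicode); // u"..."
    assert_eq!(kind_from_prefix(false, false, false, true), StringKind::FString); // f"..."
    println!("prefix mapping ok");
}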
@@ -479,87 +493,7 @@ where
}
}
fn unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> {
let mut p: u32 = 0u32;
let unicode_error = LexicalError {
error: LexicalErrorType::UnicodeError,
location: self.get_pos(),
};
for i in 1..=literal_number {
match self.next_char() {
Some(c) => match c.to_digit(16) {
Some(d) => p += d << ((literal_number - i) * 4),
None => return Err(unicode_error),
},
None => return Err(unicode_error),
}
}
match p {
0xD800..=0xDFFF => Ok(std::char::REPLACEMENT_CHARACTER),
_ => std::char::from_u32(p).ok_or(unicode_error),
}
}
fn parse_octet(&mut self, first: char) -> char {
let mut octet_content = String::new();
octet_content.push(first);
while octet_content.len() < 3 {
if let Some('0'..='7') = self.window[0] {
octet_content.push(self.next_char().unwrap())
} else {
break;
}
}
let value = u32::from_str_radix(&octet_content, 8).unwrap();
char::from_u32(value).unwrap()
}
fn parse_unicode_name(&mut self) -> Result<char, LexicalError> {
let start_pos = self.get_pos();
match self.next_char() {
Some('{') => {}
_ => {
return Err(LexicalError {
error: LexicalErrorType::StringError,
location: start_pos,
})
}
}
let start_pos = self.get_pos();
let mut name = String::new();
loop {
match self.next_char() {
Some('}') => break,
Some(c) => name.push(c),
None => {
return Err(LexicalError {
error: LexicalErrorType::StringError,
location: self.get_pos(),
})
}
}
}
if name.len() > MAX_UNICODE_NAME {
return Err(LexicalError {
error: LexicalErrorType::UnicodeError,
location: self.get_pos(),
});
}
unicode_names2::character(&name).ok_or(LexicalError {
error: LexicalErrorType::UnicodeError,
location: start_pos,
})
}
fn lex_string(
&mut self,
is_bytes: bool,
is_raw: bool,
is_unicode: bool,
is_fstring: bool,
) -> LexResult {
fn lex_string(&mut self, kind: StringKind) -> LexResult {
let start_pos = self.get_pos();
let quote_char = self.next_char().unwrap();
let mut string_content = String::new();
@@ -577,62 +511,24 @@ where
loop {
match self.next_char() {
Some('\\') => {
if self.window[0] == Some(quote_char) && !is_raw {
string_content.push(quote_char);
self.next_char();
} else if is_raw {
Some(c) => {
if c == '\\' {
if let Some(next_c) = self.next_char() {
string_content.push('\\');
if let Some(c) = self.next_char() {
string_content.push(c)
} else {
string_content.push(next_c);
continue;
}
}
if c == '\n' && !triple_quoted {
return Err(LexicalError {
error: LexicalErrorType::StringError,
error: LexicalErrorType::OtherError(
"EOL while scanning string literal".to_owned(),
),
location: self.get_pos(),
});
}
} else {
match self.next_char() {
Some('\\') => {
string_content.push('\\');
}
Some('\'') => string_content.push('\''),
Some('\"') => string_content.push('\"'),
Some('\n') => {
// Ignore Unix EOL character
}
Some('a') => string_content.push('\x07'),
Some('b') => string_content.push('\x08'),
Some('f') => string_content.push('\x0c'),
Some('n') => {
string_content.push('\n');
}
Some('r') => string_content.push('\r'),
Some('t') => {
string_content.push('\t');
}
Some('v') => string_content.push('\x0b'),
Some(o @ '0'..='7') => string_content.push(self.parse_octet(o)),
Some('x') => string_content.push(self.unicode_literal(2)?),
Some('u') if !is_bytes => string_content.push(self.unicode_literal(4)?),
Some('U') if !is_bytes => string_content.push(self.unicode_literal(8)?),
Some('N') if !is_bytes => {
string_content.push(self.parse_unicode_name()?)
}
Some(c) => {
string_content.push('\\');
string_content.push(c);
}
None => {
return Err(LexicalError {
error: LexicalErrorType::StringError,
location: self.get_pos(),
});
}
}
}
}
Some(c) => {
if c == quote_char {
if triple_quoted {
// Look ahead at the next two characters; if we have two more
@@ -645,20 +541,12 @@ where
self.next_char();
break;
}
string_content.push(c);
} else {
break;
}
} else {
if (c == '\n' && !triple_quoted) || (is_bytes && !c.is_ascii()) {
return Err(LexicalError {
error: LexicalErrorType::Eof,
location: self.get_pos(),
});
}
string_content.push(c);
}
}
None => {
return Err(LexicalError {
error: if triple_quoted {
@@ -672,25 +560,11 @@ where
}
}
let end_pos = self.get_pos();
let tok = if is_bytes {
Tok::Bytes {
value: string_content.chars().map(|c| c as u8).collect(),
}
} else {
let kind = if is_fstring {
StringKind::F
} else if is_unicode {
StringKind::U
} else {
StringKind::Normal
};
Tok::String {
let tok = Tok::String {
value: string_content,
kind,
}
triple_quoted,
};
Ok((start_pos, tok, end_pos))
}
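
The net effect of the slimmed-down lex_string: content is stored verbatim, escape sequences included, and all escape decoding is deferred to the new string_parser.rs; the lexer now only tracks quotes and unescaped newlines. A hypothetical test in the style of this module's own tests (lex_source, stok, and raw_stok are the helpers defined below):

#[test]
fn test_escapes_are_deferred() {
    // The lexer no longer expands \t; it records the raw text plus the kind.
    let tokens = lex_source(r#""\t" r"\t""#);
    assert_eq!(tokens, vec![stok(r"\t"), raw_stok(r"\t"), Tok::Newline]);
}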
@@ -907,7 +781,7 @@ where
self.emit(comment);
}
'"' | '\'' => {
let string = self.lex_string(false, false, false, false)?;
let string = self.lex_string(StringKind::String)?;
self.emit(string);
}
'=' => {
@@ -1367,15 +1241,17 @@ mod tests {
fn stok(s: &str) -> Tok {
Tok::String {
value: s.to_owned(),
kind: StringKind::Normal,
kind: StringKind::String,
triple_quoted: false,
}
}
#[test]
fn test_raw_string() {
let source = "r\"\\\\\" \"\\\\\"";
let tokens = lex_source(source);
assert_eq!(tokens, vec![stok("\\\\"), stok("\\"), Tok::Newline,]);
fn raw_stok(s: &str) -> Tok {
Tok::String {
value: s.to_owned(),
kind: StringKind::RawString,
triple_quoted: false,
}
}
#[test]
@@ -1677,13 +1553,13 @@ mod tests {
vec![
stok("double"),
stok("single"),
stok("can't"),
stok("\\\""),
stok("\t\r\n"),
stok("\\g"),
stok("raw\\'"),
stok("Đ"),
stok("\u{80}\u{0}a"),
stok(r"can\'t"),
stok(r#"\\\""#),
stok(r"\t\r\n"),
stok(r"\g"),
raw_stok(r"raw\'"),
stok(r"\420"),
stok(r"\200\0a"),
Tok::Newline,
]
);
@@ -1699,7 +1575,7 @@ mod tests {
assert_eq!(
tokens,
vec![
stok("abcdef"),
stok("abc\\\ndef"),
Tok::Newline,
]
)
@@ -1714,78 +1590,10 @@ mod tests {
test_string_continuation_unix_eol: UNIX_EOL,
}
#[test]
fn test_single_quoted_byte() {
// single quote
let source = r##"b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff'"##;
let tokens = lex_source(source);
let res = (0..=255).collect::<Vec<u8>>();
assert_eq!(tokens, vec![Tok::Bytes { value: res }, Tok::Newline]);
}
#[test]
fn test_double_quoted_byte() {
// double quote
let source = r##"b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff""##;
let tokens = lex_source(source);
let res = (0..=255).collect::<Vec<u8>>();
assert_eq!(tokens, vec![Tok::Bytes { value: res }, Tok::Newline]);
}
#[test]
fn test_escape_char_in_byte_literal() {
// backslash does not escape
let source = r##"b"omkmok\Xaa""##;
let tokens = lex_source(source);
let res = vec![111, 109, 107, 109, 111, 107, 92, 88, 97, 97];
assert_eq!(tokens, vec![Tok::Bytes { value: res }, Tok::Newline]);
}
#[test]
fn test_raw_byte_literal() {
let source = r"rb'\x1z'";
let tokens = lex_source(source);
assert_eq!(
tokens,
vec![
Tok::Bytes {
value: b"\\x1z".to_vec()
},
Tok::Newline
]
);
let source = r"rb'\\'";
let tokens = lex_source(source);
assert_eq!(
tokens,
vec![
Tok::Bytes {
value: b"\\\\".to_vec()
},
Tok::Newline
]
)
}
#[test]
fn test_escape_octet() {
let source = r##"b'\43a\4\1234'"##;
let tokens = lex_source(source);
assert_eq!(
tokens,
vec![
Tok::Bytes {
value: b"#a\x04S4".to_vec()
},
Tok::Newline
]
)
}
#[test]
fn test_escape_unicode_name() {
let source = r#""\N{EN SPACE}""#;
let tokens = lex_source(source);
assert_eq!(tokens, vec![stok("\u{2002}"), Tok::Newline])
assert_eq!(tokens, vec![stok(r"\N{EN SPACE}"), Tok::Newline])
}
}

View file

@@ -23,11 +23,11 @@ extern crate log;
pub use rustpython_ast as ast;
pub mod error;
mod fstring;
mod function;
pub mod lexer;
pub mod mode;
pub mod parser;
mod string_parser;
#[rustfmt::skip]
mod python;
mod context;

View file

@@ -8,6 +8,7 @@
use crate::lexer::{LexResult, Tok};
pub use crate::mode::Mode;
use crate::{ast, error::ParseError, lexer, python};
use ast::Location;
use itertools::Itertools;
use std::iter;
@@ -65,7 +66,15 @@ pub fn parse_program(source: &str, source_path: &str) -> Result<ast::Suite, Pars
///
/// ```
pub fn parse_expression(source: &str, path: &str) -> Result<ast::Expr, ParseError> {
parse(source, Mode::Expression, path).map(|top| match top {
parse_expression_located(source, path, Location::new(1, 0))
}
pub fn parse_expression_located(
source: &str,
path: &str,
location: Location,
) -> Result<ast::Expr, ParseError> {
parse_located(source, Mode::Expression, path, location).map(|top| match top {
ast::Mod::Expression { body } => *body,
_ => unreachable!(),
})
@@ -73,7 +82,17 @@ pub fn parse_expression(source: &str, path: &str) -> Result<ast::Expr, ParseErro
// Parse the given source code.
pub fn parse(source: &str, mode: Mode, source_path: &str) -> Result<ast::Mod, ParseError> {
let lxr = lexer::make_tokenizer(source);
parse_located(source, mode, source_path, Location::new(1, 0))
}
// Parse the given source code starting at the given location.
pub fn parse_located(
source: &str,
mode: Mode,
source_path: &str,
location: Location,
) -> Result<ast::Mod, ParseError> {
let lxr = lexer::make_tokenizer_located(source, location);
let marker_token = (Default::default(), mode.to_marker(), Default::default());
let tokenizer = iter::once(Ok(marker_token))
.chain(lxr)
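
This pair of *_located entry points is the hook the new string parser relies on: an f-string replacement field is re-parsed as a standalone expression, but at its true position in the enclosing file instead of at (1, 0). A hypothetical caller, assuming the crate is consumed as rustpython_parser with the module paths shown in this diff:

use rustpython_parser::ast::Location;
use rustpython_parser::parser::parse_expression_located;

fn main() {
    // Parse "x + 1" as if it began at row 3, column 10; locations in the
    // returned AST are offset accordingly.
    let expr = parse_expression_located("x + 1", "<fstring>", Location::new(3, 10)).unwrap();
    println!("{expr:?}");
}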

View file

@@ -0,0 +1,297 @@
---
source: compiler/parser/src/string.rs
expression: parse_ast
---
[
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 738,
},
),
custom: (),
node: Expr {
value: Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 738,
},
),
custom: (),
node: Constant {
value: Bytes(
[
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63,
64,
65,
66,
67,
68,
69,
70,
71,
72,
73,
74,
75,
76,
77,
78,
79,
80,
81,
82,
83,
84,
85,
86,
87,
88,
89,
90,
91,
92,
93,
94,
95,
96,
97,
98,
99,
100,
101,
102,
103,
104,
105,
106,
107,
108,
109,
110,
111,
112,
113,
114,
115,
116,
117,
118,
119,
120,
121,
122,
123,
124,
125,
126,
127,
128,
129,
130,
131,
132,
133,
134,
135,
136,
137,
138,
139,
140,
141,
142,
143,
144,
145,
146,
147,
148,
149,
150,
151,
152,
153,
154,
155,
156,
157,
158,
159,
160,
161,
162,
163,
164,
165,
166,
167,
168,
169,
170,
171,
172,
173,
174,
175,
176,
177,
178,
179,
180,
181,
182,
183,
184,
185,
186,
187,
188,
189,
190,
191,
192,
193,
194,
195,
196,
197,
198,
199,
200,
201,
202,
203,
204,
205,
206,
207,
208,
209,
210,
211,
212,
213,
214,
215,
216,
217,
218,
219,
220,
221,
222,
223,
224,
225,
226,
227,
228,
229,
230,
231,
232,
233,
234,
235,
236,
237,
238,
239,
240,
241,
242,
243,
244,
245,
246,
247,
248,
249,
250,
251,
252,
253,
254,
255,
],
),
kind: None,
},
},
},
},
]

View file

@@ -0,0 +1,51 @@
---
source: compiler/parser/src/string.rs
expression: parse_ast
---
[
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 13,
},
),
custom: (),
node: Expr {
value: Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 13,
},
),
custom: (),
node: Constant {
value: Bytes(
[
111,
109,
107,
109,
111,
107,
92,
88,
97,
97,
],
),
kind: None,
},
},
},
},
]

View file

@@ -0,0 +1,46 @@
---
source: compiler/parser/src/string.rs
expression: parse_ast
---
[
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 14,
},
),
custom: (),
node: Expr {
value: Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 14,
},
),
custom: (),
node: Constant {
value: Bytes(
[
35,
97,
4,
83,
52,
],
),
kind: None,
},
},
},
},
]

View file

@@ -0,0 +1,91 @@
---
source: compiler/parser/src/string.rs
expression: parse_ast
---
[
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 8,
},
),
custom: (),
node: Expr {
value: Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 8,
},
),
custom: (),
node: JoinedStr {
values: [
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 8,
},
),
custom: (),
node: Constant {
value: Str(
"\\",
),
kind: None,
},
},
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 8,
},
),
custom: (),
node: FormattedValue {
value: Located {
location: Location {
row: 1,
column: 5,
},
end_location: Some(
Location {
row: 1,
column: 6,
},
),
custom: (),
node: Name {
id: "x",
ctx: Load,
},
},
conversion: 0,
format_spec: None,
},
},
],
},
},
},
},
]

View file

@@ -0,0 +1,91 @@
---
source: compiler/parser/src/string.rs
expression: parse_ast
---
[
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 8,
},
),
custom: (),
node: Expr {
value: Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 8,
},
),
custom: (),
node: JoinedStr {
values: [
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 8,
},
),
custom: (),
node: Constant {
value: Str(
"\n",
),
kind: None,
},
},
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 8,
},
),
custom: (),
node: FormattedValue {
value: Located {
location: Location {
row: 1,
column: 5,
},
end_location: Some(
Location {
row: 1,
column: 6,
},
),
custom: (),
node: Name {
id: "x",
ctx: Load,
},
},
conversion: 0,
format_spec: None,
},
},
],
},
},
},
},
]

View file

@@ -0,0 +1,91 @@
---
source: compiler/parser/src/string.rs
expression: parse_ast
---
[
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 2,
column: 4,
},
),
custom: (),
node: Expr {
value: Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 2,
column: 4,
},
),
custom: (),
node: JoinedStr {
values: [
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 2,
column: 4,
},
),
custom: (),
node: Constant {
value: Str(
"\\\n",
),
kind: None,
},
},
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 2,
column: 4,
},
),
custom: (),
node: FormattedValue {
value: Located {
location: Location {
row: 2,
column: 1,
},
end_location: Some(
Location {
row: 2,
column: 2,
},
),
custom: (),
node: Name {
id: "x",
ctx: Load,
},
},
conversion: 0,
format_spec: None,
},
},
],
},
},
},
},
]

View file

@@ -0,0 +1,91 @@
---
source: compiler/parser/src/string.rs
expression: parse_ast
---
[
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 2,
column: 6,
},
),
custom: (),
node: Expr {
value: Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 2,
column: 6,
},
),
custom: (),
node: JoinedStr {
values: [
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 2,
column: 6,
},
),
custom: (),
node: Constant {
value: Str(
"\n",
),
kind: None,
},
},
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 2,
column: 6,
},
),
custom: (),
node: FormattedValue {
value: Located {
location: Location {
row: 2,
column: 1,
},
end_location: Some(
Location {
row: 2,
column: 2,
},
),
custom: (),
node: Name {
id: "x",
ctx: Load,
},
},
conversion: 0,
format_spec: None,
},
},
],
},
},
},
},
]

View file

@@ -65,12 +65,12 @@ expression: parse_ast
value: Located {
location: Location {
row: 1,
column: 1,
column: 17,
},
end_location: Some(
Location {
row: 1,
column: 4,
column: 20,
},
),
custom: (),

View file

@@ -0,0 +1,45 @@
---
source: compiler/parser/src/string.rs
expression: parse_ast
---
[
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 8,
},
),
custom: (),
node: Expr {
value: Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 8,
},
),
custom: (),
node: Constant {
value: Bytes(
[
92,
120,
49,
122,
],
),
kind: None,
},
},
},
},
]

View file

@@ -0,0 +1,43 @@
---
source: compiler/parser/src/string.rs
expression: parse_ast
---
[
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 6,
},
),
custom: (),
node: Expr {
value: Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 6,
},
),
custom: (),
node: Constant {
value: Bytes(
[
92,
92,
],
),
kind: None,
},
},
},
},
]

View file

@@ -0,0 +1,72 @@
---
source: compiler/parser/src/string.rs
expression: parse_ast
---
[
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 7,
},
),
custom: (),
node: Expr {
value: Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 7,
},
),
custom: (),
node: JoinedStr {
values: [
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 7,
},
),
custom: (),
node: FormattedValue {
value: Located {
location: Location {
row: 1,
column: 4,
},
end_location: Some(
Location {
row: 1,
column: 5,
},
),
custom: (),
node: Name {
id: "x",
ctx: Load,
},
},
conversion: 0,
format_spec: None,
},
},
],
},
},
},
},
]

View file

@@ -0,0 +1,297 @@
---
source: compiler/parser/src/string.rs
expression: parse_ast
---
[
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 738,
},
),
custom: (),
node: Expr {
value: Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 738,
},
),
custom: (),
node: Constant {
value: Bytes(
[
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63,
64,
65,
66,
67,
68,
69,
70,
71,
72,
73,
74,
75,
76,
77,
78,
79,
80,
81,
82,
83,
84,
85,
86,
87,
88,
89,
90,
91,
92,
93,
94,
95,
96,
97,
98,
99,
100,
101,
102,
103,
104,
105,
106,
107,
108,
109,
110,
111,
112,
113,
114,
115,
116,
117,
118,
119,
120,
121,
122,
123,
124,
125,
126,
127,
128,
129,
130,
131,
132,
133,
134,
135,
136,
137,
138,
139,
140,
141,
142,
143,
144,
145,
146,
147,
148,
149,
150,
151,
152,
153,
154,
155,
156,
157,
158,
159,
160,
161,
162,
163,
164,
165,
166,
167,
168,
169,
170,
171,
172,
173,
174,
175,
176,
177,
178,
179,
180,
181,
182,
183,
184,
185,
186,
187,
188,
189,
190,
191,
192,
193,
194,
195,
196,
197,
198,
199,
200,
201,
202,
203,
204,
205,
206,
207,
208,
209,
210,
211,
212,
213,
214,
215,
216,
217,
218,
219,
220,
221,
222,
223,
224,
225,
226,
227,
228,
229,
230,
231,
232,
233,
234,
235,
236,
237,
238,
239,
240,
241,
242,
243,
244,
245,
246,
247,
248,
249,
250,
251,
252,
253,
254,
255,
],
),
kind: None,
},
},
},
},
]

View file

@@ -0,0 +1,72 @@
---
source: compiler/parser/src/string.rs
expression: parse_ast
---
[
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 11,
},
),
custom: (),
node: Expr {
value: Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 11,
},
),
custom: (),
node: JoinedStr {
values: [
Located {
location: Location {
row: 1,
column: 0,
},
end_location: Some(
Location {
row: 1,
column: 11,
},
),
custom: (),
node: FormattedValue {
value: Located {
location: Location {
row: 1,
column: 6,
},
end_location: Some(
Location {
row: 1,
column: 7,
},
),
custom: (),
node: Name {
id: "x",
ctx: Load,
},
},
conversion: 0,
format_spec: None,
},
},
],
},
},
},
},
]

View file

@@ -1,35 +1,79 @@
use crate::{
ast::{Constant, Expr, ExprKind, Location},
error::{LexicalError, LexicalErrorType},
fstring::parse_located_fstring,
string_parser::parse_string,
token::StringKind,
};
use itertools::Itertools;
pub fn parse_strings(
values: Vec<(Location, (String, StringKind), Location)>,
values: Vec<(Location, (String, StringKind, bool), Location)>,
) -> Result<Expr, LexicalError> {
// Preserve the initial location and kind.
let initial_start = values[0].0;
let last_end = values.last().unwrap().2;
let initial_kind = (values[0].1 .1 == StringKind::U).then(|| "u".to_owned());
let initial_kind = (values[0].1 .1 == StringKind::Unicode).then(|| "u".to_owned());
let has_fstring = values.iter().any(|(_, (_, kind, ..), _)| kind.is_fstring());
let num_bytes = values
.iter()
.filter(|(_, (_, kind, ..), _)| kind.is_bytes())
.count();
let has_bytes = num_bytes > 0;
// Optimization: fast-track the common case of a single string.
if matches!(&*values, [(_, (_, StringKind::Normal | StringKind::U), _)]) {
let value = values.into_iter().last().unwrap().1 .0;
if has_bytes && num_bytes < values.len() {
return Err(LexicalError {
error: LexicalErrorType::OtherError(
"cannot mix bytes and nonbytes literals".to_owned(),
),
location: initial_start,
});
}
if has_bytes {
let mut content: Vec<u8> = vec![];
for (start, (source, kind, triple_quoted), end) in values {
for value in parse_string(&source, kind, triple_quoted, start, end)? {
match value.node {
ExprKind::Constant {
value: Constant::Bytes(value),
..
} => content.extend(value),
_ => unreachable!("Unexpected non-bytes expression."),
}
}
}
return Ok(Expr::new(
initial_start,
last_end,
ExprKind::Constant {
value: Constant::Str(value),
kind: initial_kind,
value: Constant::Bytes(content),
kind: None,
},
));
}
// Determine whether the list of values contains any f-strings. (If not, we can return a
// single Constant at the end, rather than a JoinedStr.)
let mut has_fstring = false;
if !has_fstring {
let mut content: Vec<String> = vec![];
for (start, (source, kind, triple_quoted), end) in values {
for value in parse_string(&source, kind, triple_quoted, start, end)? {
match value.node {
ExprKind::Constant {
value: Constant::Str(value),
..
} => content.push(value),
_ => unreachable!("Unexpected non-string expression."),
}
}
}
return Ok(Expr::new(
initial_start,
last_end,
ExprKind::Constant {
value: Constant::Str(content.join("")),
kind: initial_kind,
},
));
}
// De-duplicate adjacent constants.
let mut deduped: Vec<Expr> = vec![];
@@ -46,17 +90,8 @@ pub fn parse_strings(
)
};
for (start, (string, string_kind), end) in values {
match string_kind {
StringKind::Normal | StringKind::U => current.push(string),
StringKind::F => {
has_fstring = true;
for value in
parse_located_fstring(&string, start, end).map_err(|e| LexicalError {
location: start,
error: LexicalErrorType::FStringError(e.error),
})?
{
for (start, (source, kind, triple_quoted), end) in values {
for value in parse_string(&source, kind, triple_quoted, start, end)? {
match value.node {
ExprKind::FormattedValue { .. } => {
if !current.is_empty() {
@@ -64,19 +99,14 @@ pub fn parse_strings(
}
deduped.push(value)
}
ExprKind::Constant { value, .. } => {
if let Constant::Str(value) = value {
current.push(value);
} else {
unreachable!("Unexpected non-string constant.");
}
}
ExprKind::Constant {
value: Constant::Str(value),
..
} => current.push(value),
_ => unreachable!("Unexpected non-string expression."),
}
}
}
}
}
if !current.is_empty() {
deduped.push(take_current(&mut current));
}
@@ -101,64 +131,153 @@ mod tests {
#[test]
fn test_parse_string_concat() {
let source = String::from("'Hello ' 'world'");
let parse_ast = parse_program(&source, "<test>").unwrap();
let source = "'Hello ' 'world'";
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_parse_u_string_concat_1() {
let source = String::from("'Hello ' u'world'");
let parse_ast = parse_program(&source, "<test>").unwrap();
let source = "'Hello ' u'world'";
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_parse_u_string_concat_2() {
let source = String::from("u'Hello ' 'world'");
let parse_ast = parse_program(&source, "<test>").unwrap();
let source = "u'Hello ' 'world'";
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_parse_f_string_concat_1() {
let source = String::from("'Hello ' f'world'");
let parse_ast = parse_program(&source, "<test>").unwrap();
let source = "'Hello ' f'world'";
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_parse_f_string_concat_2() {
let source = String::from("'Hello ' f'world'");
let parse_ast = parse_program(&source, "<test>").unwrap();
let source = "'Hello ' f'world'";
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_parse_f_string_concat_3() {
let source = String::from("'Hello ' f'world{\"!\"}'");
let parse_ast = parse_program(&source, "<test>").unwrap();
let source = "'Hello ' f'world{\"!\"}'";
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_parse_u_f_string_concat_1() {
let source = String::from("u'Hello ' f'world'");
let parse_ast = parse_program(&source, "<test>").unwrap();
let source = "u'Hello ' f'world'";
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_parse_u_f_string_concat_2() {
let source = String::from("u'Hello ' f'world' '!'");
let parse_ast = parse_program(&source, "<test>").unwrap();
let source = "u'Hello ' f'world' '!'";
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_parse_string_triple_quotes_with_kind() {
let source = String::from("u'''Hello, world!'''");
let parse_ast = parse_program(&source, "<test>").unwrap();
let source = "u'''Hello, world!'''";
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_single_quoted_byte() {
// single quote
let source = r##"b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff'"##;
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_double_quoted_byte() {
// double quote
let source = r##"b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff""##;
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_escape_char_in_byte_literal() {
// backslash does not escape
let source = r##"b"omkmok\Xaa""##;
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_raw_byte_literal_1() {
let source = r"rb'\x1z'";
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_raw_byte_literal_2() {
let source = r"rb'\\'";
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_escape_octet() {
let source = r##"b'\43a\4\1234'"##;
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_fstring_escaped_newline() {
let source = r#"f"\n{x}""#;
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_fstring_unescaped_newline() {
let source = r#"f"""
{x}""""#;
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_fstring_escaped_character() {
let source = r#"f"\\{x}""#;
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_raw_fstring() {
let source = r#"rf"{x}""#;
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_triple_quoted_raw_fstring() {
let source = r#"rf"""{x}""""#;
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_fstring_line_continuation() {
let source = r#"rf"\
{x}""#;
let parse_ast = parse_program(source, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
}
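
One behavior of the rewritten parse_strings that the tests above only exercise from the happy path: bytes and str literals cannot be concatenated. A hypothetical test in the same style (parse_program as imported by this module):

#[test]
fn test_mixed_bytes_and_str_concat_fails() {
    // parse_strings rejects this with "cannot mix bytes and nonbytes literals".
    let source = "b'Hello ' 'world'";
    assert!(parse_program(source, "<test>").is_err());
}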

parser/src/string_parser.rs (new file, 562 lines)
View file

@@ -0,0 +1,562 @@
use self::FStringErrorType::*;
use crate::{
ast::{Constant, ConversionFlag, Expr, ExprKind, Location},
error::{FStringErrorType, LexicalError, LexicalErrorType, ParseError},
parser::parse_expression_located,
token::StringKind,
};
use std::{iter, mem, str};
/// unicode_names2 does not expose `MAX_NAME_LENGTH`, so we replicate that constant here, fix #3798
pub const MAX_UNICODE_NAME: usize = 88;
pub struct StringParser<'a> {
chars: iter::Peekable<str::Chars<'a>>,
kind: StringKind,
str_start: Location,
str_end: Location,
location: Location,
}
impl<'a> StringParser<'a> {
pub fn new(
source: &'a str,
kind: StringKind,
triple_quoted: bool,
str_start: Location,
str_end: Location,
) -> Self {
let offset = kind.to_string().len() + if triple_quoted { 3 } else { 1 };
Self {
chars: source.chars().peekable(),
kind,
str_start,
str_end,
location: Location::new(str_start.row(), str_start.column() + offset),
}
}
fn next_char(&mut self) -> Option<char> {
let Some(c) = self.chars.next() else {
return None
};
if c == '\n' {
self.location.newline();
} else {
self.location.go_right();
}
Some(c)
}
fn peek(&mut self) -> Option<&char> {
self.chars.peek()
}
fn get_pos(&self) -> Location {
self.location
}
#[inline]
fn expr(&self, node: ExprKind) -> Expr {
Expr::new(self.str_start, self.str_end, node)
}
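/// Decode the hex digits of a \xHH, \uHHHH, or \UHHHHHHHH escape into a char;
/// literal_number is the number of digits to read (2, 4, or 8).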
fn parse_unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> {
let mut p: u32 = 0u32;
let unicode_error = LexicalError {
error: LexicalErrorType::UnicodeError,
location: self.get_pos(),
};
for i in 1..=literal_number {
match self.next_char() {
Some(c) => match c.to_digit(16) {
Some(d) => p += d << ((literal_number - i) * 4),
None => return Err(unicode_error),
},
None => return Err(unicode_error),
}
}
match p {
0xD800..=0xDFFF => Ok(std::char::REPLACEMENT_CHARACTER),
_ => std::char::from_u32(p).ok_or(unicode_error),
}
}
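/// Consume up to three octal digits after a backslash and build the escaped
/// character (e.g. \101 is 'A').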
fn parse_octet(&mut self, first: char) -> char {
let mut octet_content = String::new();
octet_content.push(first);
while octet_content.len() < 3 {
if let Some('0'..='7') = self.peek() {
octet_content.push(self.next_char().unwrap())
} else {
break;
}
}
let value = u32::from_str_radix(&octet_content, 8).unwrap();
char::from_u32(value).unwrap()
}
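/// Resolve a \N{...} escape to the named Unicode character via unicode_names2.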
fn parse_unicode_name(&mut self) -> Result<char, LexicalError> {
let start_pos = self.get_pos();
match self.next_char() {
Some('{') => {}
_ => {
return Err(LexicalError {
error: LexicalErrorType::StringError,
location: start_pos,
})
}
}
let start_pos = self.get_pos();
let mut name = String::new();
loop {
match self.next_char() {
Some('}') => break,
Some(c) => name.push(c),
None => {
return Err(LexicalError {
error: LexicalErrorType::StringError,
location: self.get_pos(),
})
}
}
}
if name.len() > MAX_UNICODE_NAME {
return Err(LexicalError {
error: LexicalErrorType::UnicodeError,
location: self.get_pos(),
});
}
unicode_names2::character(&name).ok_or(LexicalError {
error: LexicalErrorType::UnicodeError,
location: start_pos,
})
}
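/// Expand one backslash escape into its replacement text; unrecognized escapes
/// are kept verbatim as a backslash followed by the character.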
fn parse_escaped_char(&mut self) -> Result<String, LexicalError> {
match self.next_char() {
Some(c) => Ok(match c {
'\\' => '\\'.to_string(),
'\'' => '\''.to_string(),
'\"' => '"'.to_string(),
'\n' => "".to_string(),
'a' => '\x07'.to_string(),
'b' => '\x08'.to_string(),
'f' => '\x0c'.to_string(),
'n' => '\n'.to_string(),
'r' => '\r'.to_string(),
't' => '\t'.to_string(),
'v' => '\x0b'.to_string(),
o @ '0'..='7' => self.parse_octet(o).to_string(),
'x' => self.parse_unicode_literal(2)?.to_string(),
'u' if !self.kind.is_bytes() => self.parse_unicode_literal(4)?.to_string(),
'U' if !self.kind.is_bytes() => self.parse_unicode_literal(8)?.to_string(),
'N' if !self.kind.is_bytes() => self.parse_unicode_name()?.to_string(),
c => {
if self.kind.is_bytes() && !c.is_ascii() {
return Err(LexicalError {
error: LexicalErrorType::OtherError(
"bytes can only contain ASCII literal characters".to_owned(),
),
location: self.get_pos(),
});
}
format!("\\{c}")
}
}),
None => Err(LexicalError {
error: LexicalErrorType::StringError,
location: self.get_pos(),
}),
}
}
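/// Parse the inside of one {...} replacement field: the expression, an optional
/// conversion (!s, !r, !a), an optional format spec, and the self-documenting
/// '=' form added in Python 3.8.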
fn parse_formatted_value(&mut self, nested: u8) -> Result<Vec<Expr>, LexicalError> {
let mut expression = String::new();
let mut spec = None;
let mut delims = Vec::new();
let mut conversion = ConversionFlag::None;
let mut self_documenting = false;
let mut trailing_seq = String::new();
let location = self.get_pos();
while let Some(ch) = self.next_char() {
match ch {
// This could be integrated better with the remaining code, but it works as a
// starting point. In general, tokenizing the f-string here would avoid this peeking.
'!' if self.peek() == Some(&'=') => {
expression.push_str("!=");
self.next_char();
}
'=' if self.peek() == Some(&'=') => {
expression.push_str("==");
self.next_char();
}
'>' if self.peek() == Some(&'=') => {
expression.push_str(">=");
self.next_char();
}
'<' if self.peek() == Some(&'=') => {
expression.push_str("<=");
self.next_char();
}
'!' if delims.is_empty() && self.peek() != Some(&'=') => {
if expression.trim().is_empty() {
return Err(EmptyExpression.to_lexical_error(self.get_pos()));
}
conversion = match self.next_char() {
Some('s') => ConversionFlag::Str,
Some('a') => ConversionFlag::Ascii,
Some('r') => ConversionFlag::Repr,
Some(_) => {
return Err(if expression.trim().is_empty() {
EmptyExpression.to_lexical_error(self.get_pos())
} else {
InvalidConversionFlag.to_lexical_error(self.get_pos())
});
}
None => {
return Err(if expression.trim().is_empty() {
EmptyExpression.to_lexical_error(self.get_pos())
} else {
UnclosedLbrace.to_lexical_error(self.get_pos())
});
}
};
if let Some(&peek) = self.peek() {
if peek != '}' && peek != ':' {
return Err(if expression.trim().is_empty() {
EmptyExpression.to_lexical_error(self.get_pos())
} else {
UnclosedLbrace.to_lexical_error(self.get_pos())
});
}
} else {
return Err(if expression.trim().is_empty() {
EmptyExpression.to_lexical_error(self.get_pos())
} else {
UnclosedLbrace.to_lexical_error(self.get_pos())
});
}
}
// Match a Python 3.8 self-documenting expression:
// format: '{' PYTHON_EXPRESSION '=' FORMAT_SPECIFIER? '}'
'=' if self.peek() != Some(&'=') && delims.is_empty() => {
self_documenting = true;
}
':' if delims.is_empty() => {
let parsed_spec = self.parse_spec(nested)?;
spec = Some(Box::new(self.expr(ExprKind::JoinedStr {
values: parsed_spec,
})));
}
'(' | '{' | '[' => {
expression.push(ch);
delims.push(ch);
}
')' => {
let last_delim = delims.pop();
match last_delim {
Some('(') => {
expression.push(ch);
}
Some(c) => {
return Err(
MismatchedDelimiter(c, ')').to_lexical_error(self.get_pos())
);
}
None => {
return Err(Unmatched(')').to_lexical_error(self.get_pos()));
}
}
}
']' => {
let last_delim = delims.pop();
match last_delim {
Some('[') => {
expression.push(ch);
}
Some(c) => {
return Err(
MismatchedDelimiter(c, ']').to_lexical_error(self.get_pos())
);
}
None => {
return Err(Unmatched(']').to_lexical_error(self.get_pos()));
}
}
}
'}' if !delims.is_empty() => {
let last_delim = delims.pop();
match last_delim {
Some('{') => {
expression.push(ch);
}
Some(c) => {
return Err(MismatchedDelimiter(c, '}').to_lexical_error(self.get_pos()))
}
None => {}
}
}
'}' => {
if expression.trim().is_empty() {
return Err(EmptyExpression.to_lexical_error(self.get_pos()));
}
let ret = if !self_documenting {
vec![self.expr(ExprKind::FormattedValue {
value: Box::new(parse_fstring_expr(&expression, location).map_err(
|e| {
InvalidExpression(Box::new(e.error))
.to_lexical_error(self.get_pos())
},
)?),
conversion: conversion as _,
format_spec: spec,
})]
} else {
vec![
self.expr(ExprKind::Constant {
value: Constant::Str(expression.to_owned() + "="),
kind: None,
}),
self.expr(ExprKind::Constant {
value: trailing_seq.into(),
kind: None,
}),
self.expr(ExprKind::FormattedValue {
value: Box::new(
parse_fstring_expr(&expression, location).map_err(|e| {
InvalidExpression(Box::new(e.error))
.to_lexical_error(self.get_pos())
})?,
),
conversion: (if conversion == ConversionFlag::None && spec.is_none()
{
ConversionFlag::Repr
} else {
conversion
}) as _,
format_spec: spec,
}),
]
};
return Ok(ret);
}
'"' | '\'' => {
expression.push(ch);
loop {
let Some(c) = self.next_char() else {
return Err(UnterminatedString.to_lexical_error(self.get_pos()));
};
expression.push(c);
if c == ch {
break;
}
}
}
' ' if self_documenting => {
trailing_seq.push(ch);
}
'\\' => return Err(ExpressionCannotInclude('\\').to_lexical_error(self.get_pos())),
_ => {
if self_documenting {
return Err(UnclosedLbrace.to_lexical_error(self.get_pos()));
}
expression.push(ch);
}
}
}
Err(if expression.trim().is_empty() {
EmptyExpression.to_lexical_error(self.get_pos())
} else {
UnclosedLbrace.to_lexical_error(self.get_pos())
})
}
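/// Parse a format spec (the text after ':'), which may itself contain nested
/// {...} replacement fields.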
fn parse_spec(&mut self, nested: u8) -> Result<Vec<Expr>, LexicalError> {
let mut spec_constructor = Vec::new();
let mut constant_piece = String::new();
while let Some(&next) = self.peek() {
match next {
'{' => {
if !constant_piece.is_empty() {
spec_constructor.push(self.expr(ExprKind::Constant {
value: constant_piece.to_owned().into(),
kind: None,
}));
constant_piece.clear();
}
let parsed_expr = self.parse_fstring(nested + 1)?;
spec_constructor.extend(parsed_expr);
continue;
}
'}' => {
break;
}
_ => {
constant_piece.push(next);
}
}
self.next_char();
}
if !constant_piece.is_empty() {
spec_constructor.push(self.expr(ExprKind::Constant {
value: constant_piece.to_owned().into(),
kind: None,
}));
constant_piece.clear();
}
Ok(spec_constructor)
}
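/// Walk the f-string body: literal runs, {{ and }} escapes, and replacement
/// fields; at most one extra level of nesting is allowed inside a format spec.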
fn parse_fstring(&mut self, nested: u8) -> Result<Vec<Expr>, LexicalError> {
if nested >= 2 {
return Err(ExpressionNestedTooDeeply.to_lexical_error(self.get_pos()));
}
let mut content = String::new();
let mut values = vec![];
while let Some(&ch) = self.peek() {
match ch {
'{' => {
self.next_char();
if nested == 0 {
match self.peek() {
Some('{') => {
self.next_char();
content.push('{');
continue;
}
None => return Err(UnclosedLbrace.to_lexical_error(self.get_pos())),
_ => {}
}
}
if !content.is_empty() {
values.push(self.expr(ExprKind::Constant {
value: mem::take(&mut content).into(),
kind: None,
}));
}
let parsed_values = self.parse_formatted_value(nested)?;
values.extend(parsed_values);
}
'}' => {
if nested > 0 {
break;
}
self.next_char();
if let Some('}') = self.peek() {
self.next_char();
content.push('}');
} else {
return Err(SingleRbrace.to_lexical_error(self.get_pos()));
}
}
'\\' if !self.kind.is_raw() => {
self.next_char();
content.push_str(&self.parse_escaped_char()?);
}
_ => {
content.push(ch);
self.next_char();
}
}
}
if !content.is_empty() {
values.push(self.expr(ExprKind::Constant {
value: content.into(),
kind: None,
}))
}
Ok(values)
}
pub fn parse_bytes(&mut self) -> Result<Expr, LexicalError> {
let mut content = String::new();
while let Some(ch) = self.next_char() {
match ch {
'\\' if !self.kind.is_raw() => {
content.push_str(&self.parse_escaped_char()?);
}
ch => {
if !ch.is_ascii() {
return Err(LexicalError {
error: LexicalErrorType::OtherError(
"bytes can only contain ASCII literal characters".to_string(),
),
location: self.get_pos(),
});
}
content.push(ch);
}
}
}
Ok(self.expr(ExprKind::Constant {
value: Constant::Bytes(content.chars().map(|c| c as u8).collect()),
kind: None,
}))
}
pub fn parse_string(&mut self) -> Result<Expr, LexicalError> {
let mut content = String::new();
while let Some(ch) = self.next_char() {
match ch {
'\\' if !self.kind.is_raw() => {
content.push_str(&self.parse_escaped_char()?);
}
ch => content.push(ch),
}
}
Ok(self.expr(ExprKind::Constant {
value: Constant::Str(content),
kind: self.kind.is_unicode().then(|| "u".to_string()),
}))
}
pub fn parse(&mut self) -> Result<Vec<Expr>, LexicalError> {
if self.kind.is_fstring() {
self.parse_fstring(0)
} else if self.kind.is_bytes() {
self.parse_bytes().map(|expr| vec![expr])
} else {
self.parse_string().map(|expr| vec![expr])
}
}
}
fn parse_fstring_expr(source: &str, location: Location) -> Result<Expr, ParseError> {
let fstring_body = format!("({source})");
parse_expression_located(
&fstring_body,
"<fstring>",
Location::new(location.row(), location.column() - 1),
)
}
pub fn parse_string(
source: &str,
kind: StringKind,
triple_quoted: bool,
start: Location,
end: Location,
) -> Result<Vec<Expr>, LexicalError> {
StringParser::new(source, kind, triple_quoted, start, end).parse()
}
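
A minimal sketch of how string.rs drives this entry point (assuming it runs where the items above are in scope, e.g. a sibling module):

fn demo() -> Result<(), LexicalError> {
    // f"\n{x}" after the lexer has stripped the prefix and quotes:
    let exprs = parse_string(
        "\\n{x}",
        StringKind::FString,
        false,
        Location::new(1, 0),
        Location::new(1, 8),
    )?;
    assert_eq!(exprs.len(), 2); // a Str("\n") constant, then a FormattedValue for x
    Ok(())
}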

View file

@@ -1,17 +1,29 @@
//! Different token definitions.
//! Loosely based on token.h from CPython source:
use num_bigint::BigInt;
use std::fmt::{self, Write};
use std::fmt;
/// Python source code can be tokenized in a sequence of these tokens.
#[derive(Clone, Debug, PartialEq)]
pub enum Tok {
Name { name: String },
Int { value: BigInt },
Float { value: f64 },
Complex { real: f64, imag: f64 },
String { value: String, kind: StringKind },
Bytes { value: Vec<u8> },
Name {
name: String,
},
Int {
value: BigInt,
},
Float {
value: f64,
},
Complex {
real: f64,
imag: f64,
},
String {
value: String,
kind: StringKind,
triple_quoted: bool,
},
Newline,
Indent,
Dedent,
@@ -107,13 +119,6 @@ pub enum Tok {
Yield,
}
#[derive(PartialEq, Eq, Debug, Clone)]
pub enum StringKind {
Normal,
F,
U,
}
impl fmt::Display for Tok {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use Tok::*;
@@ -122,26 +127,13 @@ impl fmt::Display for Tok {
Int { value } => write!(f, "'{value}'"),
Float { value } => write!(f, "'{value}'"),
Complex { real, imag } => write!(f, "{real}j{imag}"),
String { value, kind } => {
match kind {
StringKind::F => f.write_str("f")?,
StringKind::U => f.write_str("u")?,
StringKind::Normal => {}
}
write!(f, "{value:?}")
}
Bytes { value } => {
write!(f, "b\"")?;
for i in value {
match i {
9 => f.write_str("\\t")?,
10 => f.write_str("\\n")?,
13 => f.write_str("\\r")?,
32..=126 => f.write_char(*i as char)?,
_ => write!(f, "\\x{i:02x}")?,
}
}
f.write_str("\"")
String {
value,
kind,
triple_quoted,
} => {
let quotes = "\"".repeat(if *triple_quoted { 3 } else { 1 });
write!(f, "{kind}{quotes}{value}{quotes}")
}
Newline => f.write_str("Newline"),
Indent => f.write_str("Indent"),
@@ -236,3 +228,50 @@ impl fmt::Display for Tok {
}
}
}
#[derive(PartialEq, Eq, Debug, Clone)]
pub enum StringKind {
String,
FString,
Bytes,
RawString,
RawFString,
RawBytes,
Unicode,
}
impl fmt::Display for StringKind {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use StringKind::*;
match self {
String => f.write_str(""),
FString => f.write_str("f"),
Bytes => f.write_str("b"),
RawString => f.write_str("r"),
RawFString => f.write_str("rf"),
RawBytes => f.write_str("rb"),
Unicode => f.write_str("u"),
}
}
}
impl StringKind {
pub fn is_raw(&self) -> bool {
use StringKind::{RawBytes, RawFString, RawString};
matches!(self, RawString | RawFString | RawBytes)
}
pub fn is_fstring(&self) -> bool {
use StringKind::{FString, RawFString};
matches!(self, FString | RawFString)
}
pub fn is_bytes(&self) -> bool {
use StringKind::{Bytes, RawBytes};
matches!(self, Bytes | RawBytes)
}
pub fn is_unicode(&self) -> bool {
matches!(self, StringKind::Unicode)
}
}
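
Two properties of the new enum, as a quick sketch (assuming the impls above are in scope): Display returns the exact source prefix, so tokens round-trip through the Tok::String formatter, and the predicates classify the raw variants consistently.

fn demo() {
    assert_eq!(StringKind::RawFString.to_string(), "rf");
    assert_eq!(StringKind::String.to_string(), "");
    assert!(StringKind::RawBytes.is_raw());
    assert!(StringKind::RawBytes.is_bytes());
    assert!(!StringKind::RawBytes.is_fstring());
    assert!(StringKind::Unicode.is_unicode());
}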