diff --git a/compiler/erg_compiler/context/initialize/mod.rs b/compiler/erg_compiler/context/initialize/mod.rs index e367056f..58072bd2 100644 --- a/compiler/erg_compiler/context/initialize/mod.rs +++ b/compiler/erg_compiler/context/initialize/mod.rs @@ -1794,6 +1794,7 @@ impl Context { let t_exit = t_quit.clone(); let t_repr = nd_func(vec![kw("object", Obj)], None, Str); let t_round = nd_func(vec![kw("number", Float)], None, Int); + let t_str = nd_func(vec![kw("object", Obj)], None, Str); let t_unreachable = nd_func(vec![], None, Never); self.register_builtin_py_impl("abs", t_abs, Immutable, Private, Some("abs")); self.register_builtin_py_impl("ascii", t_ascii, Immutable, Private, Some("ascii")); @@ -1855,6 +1856,7 @@ impl Context { self.register_builtin_py_impl("quit", t_quit, Immutable, Private, Some("quit")); self.register_builtin_py_impl("repr", t_repr, Immutable, Private, Some("repr")); self.register_builtin_py_impl("round", t_round, Immutable, Private, Some("round")); + self.register_builtin_py_impl("str", t_str, Immutable, Private, Some("str")); // TODO: original implementation self.register_builtin_py_impl( "unreachable", diff --git a/compiler/erg_parser/lex.rs b/compiler/erg_parser/lex.rs index 835bf95c..160d10d5 100644 --- a/compiler/erg_parser/lex.rs +++ b/compiler/erg_parser/lex.rs @@ -69,6 +69,19 @@ impl Runnable for LexerRunner { } } +#[derive(Debug, PartialEq, Eq)] +pub enum Interpolation { + SingleLine, + MultiLine, + Not, +} + +impl Interpolation { + pub const fn is_in(&self) -> bool { + matches!(self, Self::SingleLine | Self::MultiLine) + } +} + /// Lexes a source code and iterates tokens. /// /// This can be used as an iterator or to generate a `TokenStream`. @@ -85,6 +98,7 @@ pub struct Lexer /*<'a>*/ { lineno_token_starts: usize, /// 0-origin, indicates the column number in which the token appears col_token_starts: usize, + interpol_stack: Vec, } impl Lexer /*<'a>*/ { @@ -98,6 +112,7 @@ impl Lexer /*<'a>*/ { prev_token: Token::new(TokenKind::BOF, "", 0, 0), lineno_token_starts: 0, col_token_starts: 0, + interpol_stack: vec![Interpolation::Not], } } @@ -112,6 +127,7 @@ impl Lexer /*<'a>*/ { prev_token: Token::new(TokenKind::BOF, "", 0, 0), lineno_token_starts: 0, col_token_starts: 0, + interpol_stack: vec![Interpolation::Not], } } @@ -623,32 +639,94 @@ impl Lexer /*<'a>*/ { Ok(self.emit_token(kind, &cont)) } - fn lex_str(&mut self) -> LexResult { + fn str_line_break_error(token: Token, line: usize) -> LexError { + LexError::syntax_error( + line, + token.loc(), + switch_lang!( + "japanese" => "文字列内で改行をすることはできません", + "simplified_chinese" => "在一个字符串中不允许有换行符", + "traditional_chinese" => "在一個字符串中不允許有換行符", + "english" => "Line breaks are not allowed within a string", + ), + Some( + switch_lang!( + "japanese" => "\"\"内で改行を使いたい場合は'\\n'を利用してください", + "simplified_chinese" => "如果你想在\"\"中使用换行符,请使用'\\n'", + "traditional_chinese" => "如果你想在\"\"中使用換行符,請使用'\\n'", + "english" => "If you want to use line breaks within \"\", use '\\n'", + ) + .into(), + ), + ) + } + + fn invalid_escape_error(ch: char, token: Token) -> LexError { + LexError::syntax_error( + 0, + token.loc(), + switch_lang!( + "japanese" => format!("不正なエスケープシーケンスです: \\{}", ch), + "simplified_chinese" => format!("不合法的转义序列: \\{}", ch), + "traditional_chinese" => format!("不合法的轉義序列: \\{}", ch), + "english" => format!("illegal escape sequence: \\{}", ch), + ), + None, + ) + } + + fn unclosed_string_error(token: Token, by: &str, line: usize) -> LexError { + let by = if by.is_empty() { + "".to_string() + } else { + switch_lang!( + "japanese" => format!("\"\"\"によって"), + "simplified_chinese" => format!("\"\"\"关"), + "traditional_chinese" => format!("\"\"\"关"), + "english" => format!("by \"\"\""), + ) + }; + LexError::syntax_error( + line, + token.loc(), + switch_lang!( + "japanese" => format!("文字列が{by}閉じられていません"), + "simplified_chinese" => format!("字符串没有被{by}闭"), + "traditional_chinese" => format!("字符串没有被{by}闭"), + "english" => format!("the string is not closed {by}"), + ), + None, + ) + } + + fn unclosed_interpol_error(token: Token) -> LexError { + LexError::syntax_error( + 0, + token.loc(), + switch_lang!( + "japanese" => "文字列内の補間が閉じられていません", + "simplified_chinese" => "字符串内的插值没有被闭", + "traditional_chinese" => "字符串內的插值沒有被閉", + "english" => "the interpolation in the string is not closed", + ), + None, + ) + } + + fn lex_single_str(&mut self) -> LexResult { let mut s = "\"".to_string(); while let Some(c) = self.peek_cur_ch() { match c { - '\n' => { - let token = self.emit_token(Illegal, &s); - return Err(LexError::syntax_error( - 0, - token.loc(), - switch_lang!( - "japanese" => "文字列内で改行をすることはできません", - "simplified_chinese" => "在一个字符串中不允许有换行符", - "traditional_chinese" => "在一個字符串中不允許有換行符", - "english" => "Line breaks are not allowed within a string", - ), - Some( - switch_lang!( - "japanese" => "\"\"内で改行を使いたい場合は'\\n'を利用してください", - "simplified_chinese" => "如果你想在\"\"中使用换行符,请使用'\\n'", - "traditional_chinese" => "如果你想在\"\"中使用換行符,請使用'\\n'", - "english" => "If you want to use line breaks within \"\", use '\\n'", - ) - .into(), - ), - )); - } + '\n' => match self.interpol_stack.last().unwrap() { + Interpolation::SingleLine if self.interpol_stack.len() == 1 => { + let token = self.emit_token(Illegal, &s); + return Err(Self::str_line_break_error(token, line!() as usize)); + } + _ => { + let token = self.emit_token(Illegal, &s); + return Err(Self::unclosed_interpol_error(token)); + } + }, '"' => { s.push(self.consume().unwrap()); let token = self.emit_token(StrLit, &s); @@ -659,6 +737,12 @@ impl Lexer /*<'a>*/ { if c == '\\' { let next_c = self.consume().unwrap(); match next_c { + '{' => { + s.push_str("\\{"); + self.interpol_stack.push(Interpolation::SingleLine); + let token = self.emit_token(StrInterpLeft, &s); + return Ok(token); + } '0' => s.push('\0'), 'r' => s.push('\r'), 'n' => s.push('\n'), @@ -668,17 +752,7 @@ impl Lexer /*<'a>*/ { '\\' => s.push('\\'), _ => { let token = self.emit_token(Illegal, &format!("\\{next_c}")); - return Err(LexError::syntax_error( - 0, - token.loc(), - switch_lang!( - "japanese" => format!("不正なエスケープシーケンスです: \\{}", next_c), - "simplified_chinese" => format!("不合法的转义序列: \\{}", next_c), - "traditional_chinese" => format!("不合法的轉義序列: \\{}", next_c), - "english" => format!("illegal escape sequence: \\{}", next_c), - ), - None, - )); + return Err(Self::invalid_escape_error(next_c, token)); } } } else { @@ -691,17 +765,7 @@ impl Lexer /*<'a>*/ { } } let token = self.emit_token(Illegal, &s); - Err(LexError::syntax_error( - 0, - token.loc(), - switch_lang!( - "japanese" => "文字列が\"によって閉じられていません", - "simplified_chinese" => "字符串没有被\"关闭", - "traditional_chinese" => "字符串没有被\"关闭", - "english" => "the string is not closed by \"", - ), - None, - )) + Err(Self::unclosed_string_error(token, "\"", line!() as usize)) } fn lex_multi_line_str(&mut self) -> LexResult { @@ -712,11 +776,21 @@ impl Lexer /*<'a>*/ { let next_c = self.peek_cur_ch(); let aft_next_c = self.peek_next_ch(); if next_c.is_none() { - return self._unclosed_multi_string(&s); + let token = self.emit_token(Illegal, &s); + return Err(Self::unclosed_string_error( + token, + "\"\"\"", + line!() as usize, + )); } if aft_next_c.is_none() { s.push(self.consume().unwrap()); - return self._unclosed_multi_string(&s); + let token = self.emit_token(Illegal, &s); + return Err(Self::unclosed_string_error( + token, + "\"\"\"", + line!() as usize, + )); } if next_c.unwrap() == '"' && aft_next_c.unwrap() == '"' { self.consume().unwrap(); @@ -725,6 +799,7 @@ impl Lexer /*<'a>*/ { let token = self.emit_token(StrLit, &s); return Ok(token); } + // else unclosed_string_error s.push(c); } else { let c = self.consume().unwrap(); @@ -732,6 +807,12 @@ impl Lexer /*<'a>*/ { '\\' => { let next_c = self.consume().unwrap(); match next_c { + '{' => { + s.push_str("\\{"); + self.interpol_stack.push(Interpolation::MultiLine); + let token = self.emit_token(StrInterpLeft, &s); + return Ok(token); + } '0' => s.push('\0'), 'r' => s.push('\r'), '\'' => s.push('\''), @@ -746,24 +827,14 @@ impl Lexer /*<'a>*/ { } _ => { let token = self.emit_token(Illegal, &format!("\\{next_c}")); - return Err(LexError::syntax_error( - 0, - token.loc(), - switch_lang!( - "japanese" => format!("不正なエスケープシーケンスです: \\{}", next_c), - "simplified_chinese" => format!("不合法的转义序列: \\{}", next_c), - "traditional_chinese" => format!("不合法的轉義序列: \\{}", next_c), - "english" => format!("illegal escape sequence: \\{}", next_c), - ), - None, - )); + return Err(Self::invalid_escape_error(next_c, token)); } } } '\n' => { self.lineno_token_starts += 1; self.col_token_starts = 0; - s.push('\n') + s.push('\n'); } _ => { s.push(c); @@ -774,25 +845,120 @@ impl Lexer /*<'a>*/ { } } } - self._unclosed_multi_string(&s) + let token = self.emit_token(Illegal, &s); + if self.interpol_stack.len() == 1 { + Err(Self::unclosed_string_error( + token, + "\"\"\"", + line!() as usize, + )) + } else { + Err(Self::unclosed_interpol_error(token)) + } } - // for multi-line strings unclosed error - fn _unclosed_multi_string(&mut self, s: &str) -> LexResult { - let col_end = s.rfind('\n').unwrap_or_default(); - let error_s = &s[col_end..s.len() - 1]; - let token = self.emit_token(Illegal, error_s); - Err(LexError::syntax_error( - 0, - token.loc(), - switch_lang!( - "japanese" => "文字列が\"\"\"によって閉じられていません", - "simplified_chinese" => "字符串没有被\"\"\"关闭", - "traditional_chinese" => "字符串没有被\"\"\"关闭", - "english" => "the string is not closed by \"\"\"", - ), - None, - )) + /// e.g. `}aaa"`, `}aaa{` + fn lex_interpolation_mid(&mut self) -> LexResult { + let mut s = "}".to_string(); + while let Some(c) = self.peek_cur_ch() { + match c { + '\n' => match self.interpol_stack.last().unwrap() { + Interpolation::MultiLine => { + self.lineno_token_starts += 1; + self.col_token_starts = 0; + self.consume().unwrap(); + s.push('\n'); + } + Interpolation::SingleLine => { + if self.peek_next_ch().is_some() { + let token = self.emit_token(Illegal, &s); + return Err(Self::str_line_break_error(token, line!() as usize)); + } else { + let token = self.emit_token(Illegal, &s); + return Err(Self::unclosed_string_error(token, "", line!() as usize)); + } + } + Interpolation::Not => { + let token = self.emit_token(Illegal, &s); + return Err(Self::unclosed_interpol_error(token)); + } + }, + '"' => { + s.push(self.consume().unwrap()); + match self.interpol_stack.last().unwrap() { + Interpolation::MultiLine => { + let next_c = self.peek_cur_ch(); + let aft_next_c = self.peek_next_ch(); + if next_c.is_none() { + self.interpol_stack.pop(); + let token = self.emit_token(Illegal, &s); + return Err(Self::unclosed_string_error( + token, + "\"\"\"", + line!() as usize, + )); + } + if aft_next_c.is_none() { + self.interpol_stack.pop(); + s.push(self.consume().unwrap()); + let token = self.emit_token(Illegal, &s); + return Err(Self::unclosed_string_error( + token, + "\"\"\"", + line!() as usize, + )); + } + if next_c.unwrap() == '"' && aft_next_c.unwrap() == '"' { + self.interpol_stack.pop(); + self.consume().unwrap(); + self.consume().unwrap(); + s.push_str("\"\"\""); + let token = self.emit_token(StrInterpRight, &s); + return Ok(token); + } + // else unclosed_string_error + } + Interpolation::SingleLine => { + self.interpol_stack.pop(); + let token = self.emit_token(StrInterpRight, &s); + return Ok(token); + } + Interpolation::Not => {} + } + } + _ => { + let c = self.consume().unwrap(); + if c == '\\' { + let next_c = self.consume().unwrap(); + match next_c { + '{' => { + s.push_str("\\{"); + let token = self.emit_token(StrInterpMid, &s); + return Ok(token); + } + '0' => s.push('\0'), + 'r' => s.push('\r'), + 'n' => s.push('\n'), + '\'' => s.push('\''), + '"' => s.push('"'), + 't' => s.push_str(" "), // tab is invalid, so changed into 4 whitespace + '\\' => s.push('\\'), + _ => { + let token = self.emit_token(Illegal, &format!("\\{next_c}")); + return Err(Self::invalid_escape_error(next_c, token)); + } + } + } else { + s.push(c); + if Self::is_bidi(c) { + return Err(self._invalid_unicode_character(&s)); + } + } + } + } + } + let token = self.emit_token(Illegal, &s); + Err(Self::unclosed_string_error(token, "", line!() as usize)) } fn lex_raw_ident(&mut self) -> LexResult { @@ -878,7 +1044,13 @@ impl Iterator for Lexer /*<'a>*/ { Some('[') => self.accept(LSqBr, "["), Some(']') => self.accept(RSqBr, "]"), Some('{') => self.accept(LBrace, "{"), - Some('}') => self.accept(RBrace, "}"), + Some('}') => { + if self.interpol_stack.last().unwrap().is_in() { + Some(self.lex_interpolation_mid()) + } else { + self.accept(RBrace, "}") + } + } Some('<') => match self.peek_cur_ch() { Some('.') => { self.consume(); @@ -1145,7 +1317,7 @@ impl Iterator for Lexer /*<'a>*/ { let token = self.emit_token(StrLit, "\"\""); Some(Ok(token)) } else { - Some(self.lex_str()) + Some(self.lex_single_str()) } } (Some(c), Some(next_c)) => { @@ -1154,7 +1326,7 @@ impl Iterator for Lexer /*<'a>*/ { self.consume(); // consume third '"' Some(self.lex_multi_line_str()) } else { - Some(self.lex_str()) + Some(self.lex_single_str()) } } } diff --git a/compiler/erg_parser/parse.rs b/compiler/erg_parser/parse.rs index 2d68a66f..1bc8c518 100644 --- a/compiler/erg_parser/parse.rs +++ b/compiler/erg_parser/parse.rs @@ -548,6 +548,7 @@ impl Parser { match self.peek() { Some(t) if t.category_is(TC::Literal) + || t.is(StrInterpLeft) || t.is(Symbol) || t.category_is(TC::UnaryOp) || t.is(LParen) @@ -1442,6 +1443,13 @@ impl Parser { self.level -= 1; Ok(Expr::Lit(lit)) } + Some(t) if t.is(StrInterpLeft) => { + let str_interp = self + .try_reduce_string_interpolation() + .map_err(|_| self.stack_dec())?; + self.level -= 1; + Ok(str_interp) + } Some(t) if t.is(AtSign) => { let decos = self.opt_reduce_decorators()?; let expr = self.try_reduce_chunk(false, in_brace)?; @@ -2161,6 +2169,87 @@ impl Parser { } } + /// "...\{, expr, }..." ==> "..." + str(expr) + "..." + /// "...\{, expr, }..." ==> "..." + str(expr) + "..." + fn try_reduce_string_interpolation(&mut self) -> ParseResult { + debug_call_info!(self); + let mut left = self.lpop(); + left.content = Str::from(left.content.trim_end_matches("\\{").to_string() + "\""); + left.kind = StrLit; + let mut expr = Expr::Lit(Literal::from(left)); + loop { + match self.peek() { + Some(l) if l.is(StrInterpRight) => { + let mut right = self.lpop(); + right.content = + Str::from(format!("\"{}", right.content.trim_start_matches('}'))); + right.kind = StrLit; + let right = Expr::Lit(Literal::from(right)); + let op = Token::new( + Plus, + "+", + right.ln_begin().unwrap(), + right.col_begin().unwrap(), + ); + expr = Expr::BinOp(BinOp::new(op, expr, right)); + self.level -= 1; + return Ok(expr); + } + Some(_) => { + let mid_expr = self.try_reduce_expr(true, false, false, false)?; + let str_func = Expr::local( + "str", + mid_expr.ln_begin().unwrap(), + mid_expr.col_begin().unwrap(), + ); + let call = Call::new( + str_func, + None, + Args::new(vec![PosArg::new(mid_expr)], vec![], None), + ); + let op = Token::new( + Plus, + "+", + call.ln_begin().unwrap(), + call.col_begin().unwrap(), + ); + let bin = BinOp::new(op, expr, Expr::Call(call)); + expr = Expr::BinOp(bin); + if self.cur_is(StrInterpMid) { + let mut mid = self.lpop(); + mid.content = Str::from(format!( + "\"{}\"", + mid.content.trim_start_matches('}').trim_end_matches("\\{") + )); + mid.kind = StrLit; + let mid = Expr::Lit(Literal::from(mid)); + let op = Token::new( + Plus, + "+", + mid.ln_begin().unwrap(), + mid.col_begin().unwrap(), + ); + expr = Expr::BinOp(BinOp::new(op, expr, mid)); + } + } + None => { + self.level -= 1; + let err = ParseError::syntax_error( + line!() as usize, + expr.loc(), + switch_lang!( + "japanese" => "文字列補間の終わりが見つかりませんでした", + "english" => "end of string interpolation not found", + ), + None, + ); + self.errs.push(err); + return Err(()); + } + } + } + } + /// x |> f() => f(x) fn try_reduce_stream_operator(&mut self, stack: &mut Vec) -> ParseResult<()> { debug_call_info!(self); diff --git a/examples/helloworld.er b/examples/helloworld.er index 6dc7837f..e989381e 100644 --- a/examples/helloworld.er +++ b/examples/helloworld.er @@ -3,5 +3,6 @@ print! "こんにちは、世界!" print! "Γειά σου Κόσμε!" print! "!مرحبا بالعالم" -greeting = "Hello" -print! "{greeting}, world!" +greetings = ["Good morning", "Hello", "Good evening"] +for! greetings, greeting => + print! "\{greeting}, world!"