Implement string interpolation

This commit is contained in:
Shunsuke Shibayama 2022-12-05 21:24:10 +09:00
parent c569df390c
commit 47132cfab1
4 changed files with 345 additions and 81 deletions

View file

@ -1794,6 +1794,7 @@ impl Context {
let t_exit = t_quit.clone(); let t_exit = t_quit.clone();
let t_repr = nd_func(vec![kw("object", Obj)], None, Str); let t_repr = nd_func(vec![kw("object", Obj)], None, Str);
let t_round = nd_func(vec![kw("number", Float)], None, Int); let t_round = nd_func(vec![kw("number", Float)], None, Int);
let t_str = nd_func(vec![kw("object", Obj)], None, Str);
let t_unreachable = nd_func(vec![], None, Never); let t_unreachable = nd_func(vec![], None, Never);
self.register_builtin_py_impl("abs", t_abs, Immutable, Private, Some("abs")); self.register_builtin_py_impl("abs", t_abs, Immutable, Private, Some("abs"));
self.register_builtin_py_impl("ascii", t_ascii, Immutable, Private, Some("ascii")); self.register_builtin_py_impl("ascii", t_ascii, Immutable, Private, Some("ascii"));
@ -1855,6 +1856,7 @@ impl Context {
self.register_builtin_py_impl("quit", t_quit, Immutable, Private, Some("quit")); self.register_builtin_py_impl("quit", t_quit, Immutable, Private, Some("quit"));
self.register_builtin_py_impl("repr", t_repr, Immutable, Private, Some("repr")); self.register_builtin_py_impl("repr", t_repr, Immutable, Private, Some("repr"));
self.register_builtin_py_impl("round", t_round, Immutable, Private, Some("round")); self.register_builtin_py_impl("round", t_round, Immutable, Private, Some("round"));
self.register_builtin_py_impl("str", t_str, Immutable, Private, Some("str"));
// TODO: original implementation // TODO: original implementation
self.register_builtin_py_impl( self.register_builtin_py_impl(
"unreachable", "unreachable",

View file

@ -69,6 +69,19 @@ impl Runnable for LexerRunner {
} }
} }
#[derive(Debug, PartialEq, Eq)]
pub enum Interpolation {
    /// inside a `"...\{` interpolation of a single-line string literal
    SingleLine,
    /// inside a `"""...\{` interpolation of a multi-line string literal
    MultiLine,
    /// not inside any string interpolation
    Not,
}

impl Interpolation {
    /// Returns `true` while the lexer is currently inside a string
    /// interpolation (single- or multi-line), `false` otherwise.
    pub const fn is_in(&self) -> bool {
        match self {
            Self::SingleLine | Self::MultiLine => true,
            Self::Not => false,
        }
    }
}
/// Lexes a source code and iterates tokens. /// Lexes a source code and iterates tokens.
/// ///
/// This can be used as an iterator or to generate a `TokenStream`. /// This can be used as an iterator or to generate a `TokenStream`.
@ -85,6 +98,7 @@ pub struct Lexer /*<'a>*/ {
lineno_token_starts: usize, lineno_token_starts: usize,
/// 0-origin, indicates the column number in which the token appears /// 0-origin, indicates the column number in which the token appears
col_token_starts: usize, col_token_starts: usize,
interpol_stack: Vec<Interpolation>,
} }
impl Lexer /*<'a>*/ { impl Lexer /*<'a>*/ {
@ -98,6 +112,7 @@ impl Lexer /*<'a>*/ {
prev_token: Token::new(TokenKind::BOF, "", 0, 0), prev_token: Token::new(TokenKind::BOF, "", 0, 0),
lineno_token_starts: 0, lineno_token_starts: 0,
col_token_starts: 0, col_token_starts: 0,
interpol_stack: vec![Interpolation::Not],
} }
} }
@ -112,6 +127,7 @@ impl Lexer /*<'a>*/ {
prev_token: Token::new(TokenKind::BOF, "", 0, 0), prev_token: Token::new(TokenKind::BOF, "", 0, 0),
lineno_token_starts: 0, lineno_token_starts: 0,
col_token_starts: 0, col_token_starts: 0,
interpol_stack: vec![Interpolation::Not],
} }
} }
@ -623,14 +639,9 @@ impl Lexer /*<'a>*/ {
Ok(self.emit_token(kind, &cont)) Ok(self.emit_token(kind, &cont))
} }
fn lex_str(&mut self) -> LexResult<Token> { fn str_line_break_error(token: Token, line: usize) -> LexError {
let mut s = "\"".to_string(); LexError::syntax_error(
while let Some(c) = self.peek_cur_ch() { line,
match c {
'\n' => {
let token = self.emit_token(Illegal, &s);
return Err(LexError::syntax_error(
0,
token.loc(), token.loc(),
switch_lang!( switch_lang!(
"japanese" => "文字列内で改行をすることはできません", "japanese" => "文字列内で改行をすることはできません",
@ -647,8 +658,75 @@ impl Lexer /*<'a>*/ {
) )
.into(), .into(),
), ),
)); )
} }
/// Builds the syntax error reported for an unrecognized escape
/// sequence `\<ch>` inside a string literal.
/// `token` is the already-emitted `Illegal` token covering the sequence.
fn invalid_escape_error(ch: char, token: Token) -> LexError {
    // Localize the message first, then hand it to the error constructor.
    let desc = switch_lang!(
        "japanese" => format!("不正なエスケープシーケンスです: \\{}", ch),
        "simplified_chinese" => format!("不合法的转义序列: \\{}", ch),
        "traditional_chinese" => format!("不合法的轉義序列: \\{}", ch),
        "english" => format!("illegal escape sequence: \\{}", ch),
    );
    LexError::syntax_error(0, token.loc(), desc, None)
}
/// Builds the syntax error reported when a string literal is never closed.
///
/// `by` is the closing delimiter the lexer was expecting (`"` for
/// single-line strings, `"""` for multi-line strings, or `""` when no
/// specific delimiter applies). `line` is the lexer source line (callers
/// pass `line!() as usize`) used as the error's origin for debugging.
fn unclosed_string_error(token: Token, by: &str, line: usize) -> LexError {
    // Interpolate the actual delimiter into the localized "by <delim>"
    // fragment; previously `"""` was hard-coded even when the caller
    // passed `"`, producing a misleading message for single-line strings.
    let by = if by.is_empty() {
        "".to_string()
    } else {
        switch_lang!(
            "japanese" => format!("{by}によって"),
            "simplified_chinese" => format!("{by}"),
            "traditional_chinese" => format!("{by}"),
            "english" => format!("by {by}"),
        )
    };
    LexError::syntax_error(
        line,
        token.loc(),
        switch_lang!(
            "japanese" => format!("文字列が{by}閉じられていません"),
            "simplified_chinese" => format!("字符串没有被{by}关闭"),
            "traditional_chinese" => format!("字符串沒有被{by}關閉"),
            "english" => format!("the string is not closed {by}"),
        ),
        None,
    )
}
/// Builds the syntax error reported when a `\{` interpolation opened inside
/// a string literal is never closed by a matching `}`.
/// `token` is the already-emitted `Illegal` token covering the unterminated span.
fn unclosed_interpol_error(token: Token) -> LexError {
    LexError::syntax_error(
        // NOTE(review): unlike `unclosed_string_error`, the origin line is
        // hard-coded to 0 here — presumably intentional, but confirm.
        0,
        token.loc(),
        switch_lang!(
            "japanese" => "文字列内の補間が閉じられていません",
            "simplified_chinese" => "字符串内的插值没有被闭",
            "traditional_chinese" => "字符串內的插值沒有被閉",
            "english" => "the interpolation in the string is not closed",
        ),
        None,
    )
}
fn lex_single_str(&mut self) -> LexResult<Token> {
let mut s = "\"".to_string();
while let Some(c) = self.peek_cur_ch() {
match c {
'\n' => match self.interpol_stack.last().unwrap() {
Interpolation::SingleLine if self.interpol_stack.len() == 1 => {
let token = self.emit_token(Illegal, &s);
return Err(Self::str_line_break_error(token, line!() as usize));
}
_ => {
let token = self.emit_token(Illegal, &s);
return Err(Self::unclosed_interpol_error(token));
}
},
'"' => { '"' => {
s.push(self.consume().unwrap()); s.push(self.consume().unwrap());
let token = self.emit_token(StrLit, &s); let token = self.emit_token(StrLit, &s);
@ -659,6 +737,12 @@ impl Lexer /*<'a>*/ {
if c == '\\' { if c == '\\' {
let next_c = self.consume().unwrap(); let next_c = self.consume().unwrap();
match next_c { match next_c {
'{' => {
s.push_str("\\{");
self.interpol_stack.push(Interpolation::SingleLine);
let token = self.emit_token(StrInterpLeft, &s);
return Ok(token);
}
'0' => s.push('\0'), '0' => s.push('\0'),
'r' => s.push('\r'), 'r' => s.push('\r'),
'n' => s.push('\n'), 'n' => s.push('\n'),
@ -668,17 +752,7 @@ impl Lexer /*<'a>*/ {
'\\' => s.push('\\'), '\\' => s.push('\\'),
_ => { _ => {
let token = self.emit_token(Illegal, &format!("\\{next_c}")); let token = self.emit_token(Illegal, &format!("\\{next_c}"));
return Err(LexError::syntax_error( return Err(Self::invalid_escape_error(next_c, token));
0,
token.loc(),
switch_lang!(
"japanese" => format!("不正なエスケープシーケンスです: \\{}", next_c),
"simplified_chinese" => format!("不合法的转义序列: \\{}", next_c),
"traditional_chinese" => format!("不合法的轉義序列: \\{}", next_c),
"english" => format!("illegal escape sequence: \\{}", next_c),
),
None,
));
} }
} }
} else { } else {
@ -691,17 +765,7 @@ impl Lexer /*<'a>*/ {
} }
} }
let token = self.emit_token(Illegal, &s); let token = self.emit_token(Illegal, &s);
Err(LexError::syntax_error( Err(Self::unclosed_string_error(token, "\"", line!() as usize))
0,
token.loc(),
switch_lang!(
"japanese" => "文字列が\"によって閉じられていません",
"simplified_chinese" => "字符串没有被\"关闭",
"traditional_chinese" => "字符串没有被\"关闭",
"english" => "the string is not closed by \"",
),
None,
))
} }
fn lex_multi_line_str(&mut self) -> LexResult<Token> { fn lex_multi_line_str(&mut self) -> LexResult<Token> {
@ -712,11 +776,21 @@ impl Lexer /*<'a>*/ {
let next_c = self.peek_cur_ch(); let next_c = self.peek_cur_ch();
let aft_next_c = self.peek_next_ch(); let aft_next_c = self.peek_next_ch();
if next_c.is_none() { if next_c.is_none() {
return self._unclosed_multi_string(&s); let token = self.emit_token(Illegal, &s);
return Err(Self::unclosed_string_error(
token,
"\"\"\"",
line!() as usize,
));
} }
if aft_next_c.is_none() { if aft_next_c.is_none() {
s.push(self.consume().unwrap()); s.push(self.consume().unwrap());
return self._unclosed_multi_string(&s); let token = self.emit_token(Illegal, &s);
return Err(Self::unclosed_string_error(
token,
"\"\"\"",
line!() as usize,
));
} }
if next_c.unwrap() == '"' && aft_next_c.unwrap() == '"' { if next_c.unwrap() == '"' && aft_next_c.unwrap() == '"' {
self.consume().unwrap(); self.consume().unwrap();
@ -725,6 +799,7 @@ impl Lexer /*<'a>*/ {
let token = self.emit_token(StrLit, &s); let token = self.emit_token(StrLit, &s);
return Ok(token); return Ok(token);
} }
// else unclosed_string_error
s.push(c); s.push(c);
} else { } else {
let c = self.consume().unwrap(); let c = self.consume().unwrap();
@ -732,6 +807,12 @@ impl Lexer /*<'a>*/ {
'\\' => { '\\' => {
let next_c = self.consume().unwrap(); let next_c = self.consume().unwrap();
match next_c { match next_c {
'{' => {
s.push_str("\\{");
self.interpol_stack.push(Interpolation::MultiLine);
let token = self.emit_token(StrInterpLeft, &s);
return Ok(token);
}
'0' => s.push('\0'), '0' => s.push('\0'),
'r' => s.push('\r'), 'r' => s.push('\r'),
'\'' => s.push('\''), '\'' => s.push('\''),
@ -746,24 +827,14 @@ impl Lexer /*<'a>*/ {
} }
_ => { _ => {
let token = self.emit_token(Illegal, &format!("\\{next_c}")); let token = self.emit_token(Illegal, &format!("\\{next_c}"));
return Err(LexError::syntax_error( return Err(Self::invalid_escape_error(next_c, token));
0,
token.loc(),
switch_lang!(
"japanese" => format!("不正なエスケープシーケンスです: \\{}", next_c),
"simplified_chinese" => format!("不合法的转义序列: \\{}", next_c),
"traditional_chinese" => format!("不合法的轉義序列: \\{}", next_c),
"english" => format!("illegal escape sequence: \\{}", next_c),
),
None,
));
} }
} }
} }
'\n' => { '\n' => {
self.lineno_token_starts += 1; self.lineno_token_starts += 1;
self.col_token_starts = 0; self.col_token_starts = 0;
s.push('\n') s.push('\n');
} }
_ => { _ => {
s.push(c); s.push(c);
@ -774,25 +845,120 @@ impl Lexer /*<'a>*/ {
} }
} }
} }
self._unclosed_multi_string(&s) let token = self.emit_token(Illegal, &s);
if self.interpol_stack.len() == 1 {
Err(Self::unclosed_string_error(
token,
"\"\"\"",
line!() as usize,
))
} else {
Err(Self::unclosed_interpol_error(token))
}
} }
// for multi-line strings unclosed error /// e.g. `}aaa"`, `}aaa{`
fn _unclosed_multi_string(&mut self, s: &str) -> LexResult<Token> { fn lex_interpolation_mid(&mut self) -> LexResult<Token> {
let col_end = s.rfind('\n').unwrap_or_default(); let mut s = "}".to_string();
let error_s = &s[col_end..s.len() - 1]; while let Some(c) = self.peek_cur_ch() {
let token = self.emit_token(Illegal, error_s); match c {
Err(LexError::syntax_error( '\n' => match self.interpol_stack.last().unwrap() {
0, Interpolation::MultiLine => {
token.loc(), self.lineno_token_starts += 1;
switch_lang!( self.col_token_starts = 0;
"japanese" => "文字列が\"\"\"によって閉じられていません", self.consume().unwrap();
"simplified_chinese" => "字符串没有被\"\"\"关闭", s.push('\n');
"traditional_chinese" => "字符串没有被\"\"\"关闭", }
"english" => "the string is not closed by \"\"\"", Interpolation::SingleLine => {
), if self.peek_next_ch().is_some() {
None, let token = self.emit_token(Illegal, &s);
)) return Err(Self::str_line_break_error(token, line!() as usize));
} else {
let token = self.emit_token(Illegal, &s);
return Err(Self::unclosed_string_error(token, "", line!() as usize));
}
}
Interpolation::Not => {
let token = self.emit_token(Illegal, &s);
return Err(Self::unclosed_interpol_error(token));
}
},
'"' => {
s.push(self.consume().unwrap());
match self.interpol_stack.last().unwrap() {
Interpolation::MultiLine => {
let next_c = self.peek_cur_ch();
let aft_next_c = self.peek_next_ch();
if next_c.is_none() {
self.interpol_stack.pop();
let token = self.emit_token(Illegal, &s);
return Err(Self::unclosed_string_error(
token,
"\"\"\"",
line!() as usize,
));
}
if aft_next_c.is_none() {
self.interpol_stack.pop();
s.push(self.consume().unwrap());
let token = self.emit_token(Illegal, &s);
return Err(Self::unclosed_string_error(
token,
"\"\"\"",
line!() as usize,
));
}
if next_c.unwrap() == '"' && aft_next_c.unwrap() == '"' {
self.interpol_stack.pop();
self.consume().unwrap();
self.consume().unwrap();
s.push_str("\"\"\"");
let token = self.emit_token(StrInterpRight, &s);
return Ok(token);
}
// else unclosed_string_error
}
Interpolation::SingleLine => {
self.interpol_stack.pop();
let token = self.emit_token(StrInterpRight, &s);
return Ok(token);
}
Interpolation::Not => {}
}
}
_ => {
let c = self.consume().unwrap();
if c == '\\' {
let next_c = self.consume().unwrap();
match next_c {
'{' => {
s.push_str("\\{");
let token = self.emit_token(StrInterpMid, &s);
return Ok(token);
}
'0' => s.push('\0'),
'r' => s.push('\r'),
'n' => s.push('\n'),
'\'' => s.push('\''),
'"' => s.push('"'),
't' => s.push_str(" "), // tab is invalid, so changed into 4 whitespace
'\\' => s.push('\\'),
_ => {
let token = self.emit_token(Illegal, &format!("\\{next_c}"));
return Err(Self::invalid_escape_error(next_c, token));
}
}
} else {
s.push(c);
if Self::is_bidi(c) {
return Err(self._invalid_unicode_character(&s));
}
}
}
}
}
let token = self.emit_token(Illegal, &s);
Err(Self::unclosed_string_error(token, "", line!() as usize))
} }
fn lex_raw_ident(&mut self) -> LexResult<Token> { fn lex_raw_ident(&mut self) -> LexResult<Token> {
@ -878,7 +1044,13 @@ impl Iterator for Lexer /*<'a>*/ {
Some('[') => self.accept(LSqBr, "["), Some('[') => self.accept(LSqBr, "["),
Some(']') => self.accept(RSqBr, "]"), Some(']') => self.accept(RSqBr, "]"),
Some('{') => self.accept(LBrace, "{"), Some('{') => self.accept(LBrace, "{"),
Some('}') => self.accept(RBrace, "}"), Some('}') => {
if self.interpol_stack.last().unwrap().is_in() {
Some(self.lex_interpolation_mid())
} else {
self.accept(RBrace, "}")
}
}
Some('<') => match self.peek_cur_ch() { Some('<') => match self.peek_cur_ch() {
Some('.') => { Some('.') => {
self.consume(); self.consume();
@ -1145,7 +1317,7 @@ impl Iterator for Lexer /*<'a>*/ {
let token = self.emit_token(StrLit, "\"\""); let token = self.emit_token(StrLit, "\"\"");
Some(Ok(token)) Some(Ok(token))
} else { } else {
Some(self.lex_str()) Some(self.lex_single_str())
} }
} }
(Some(c), Some(next_c)) => { (Some(c), Some(next_c)) => {
@ -1154,7 +1326,7 @@ impl Iterator for Lexer /*<'a>*/ {
self.consume(); // consume third '"' self.consume(); // consume third '"'
Some(self.lex_multi_line_str()) Some(self.lex_multi_line_str())
} else { } else {
Some(self.lex_str()) Some(self.lex_single_str())
} }
} }
} }

View file

@ -548,6 +548,7 @@ impl Parser {
match self.peek() { match self.peek() {
Some(t) Some(t)
if t.category_is(TC::Literal) if t.category_is(TC::Literal)
|| t.is(StrInterpLeft)
|| t.is(Symbol) || t.is(Symbol)
|| t.category_is(TC::UnaryOp) || t.category_is(TC::UnaryOp)
|| t.is(LParen) || t.is(LParen)
@ -1442,6 +1443,13 @@ impl Parser {
self.level -= 1; self.level -= 1;
Ok(Expr::Lit(lit)) Ok(Expr::Lit(lit))
} }
Some(t) if t.is(StrInterpLeft) => {
let str_interp = self
.try_reduce_string_interpolation()
.map_err(|_| self.stack_dec())?;
self.level -= 1;
Ok(str_interp)
}
Some(t) if t.is(AtSign) => { Some(t) if t.is(AtSign) => {
let decos = self.opt_reduce_decorators()?; let decos = self.opt_reduce_decorators()?;
let expr = self.try_reduce_chunk(false, in_brace)?; let expr = self.try_reduce_chunk(false, in_brace)?;
@ -2161,6 +2169,87 @@ impl Parser {
} }
} }
/// "...\{, expr, }..." ==> "..." + str(expr) + "..."
/// "...\{, expr, }..." ==> "..." + str(expr) + "..."
/// Desugars a string interpolation into string concatenation:
/// `"...\{ expr }..."` ==> `"..." + str(expr) + "..."`.
///
/// Expects the current token to be `StrInterpLeft` (the `"...\{` head).
/// Consumes the interleaved expression/`StrInterpMid` pairs and the final
/// `StrInterpRight` (`}..."` tail), building a left-leaning `+` chain of
/// `StrLit` pieces and `str(expr)` calls. On a missing terminator it pushes
/// a syntax error and returns `Err(())`.
fn try_reduce_string_interpolation(&mut self) -> ParseResult<Expr> {
    debug_call_info!(self);
    // Head piece: drop the trailing `\{` marker and close the literal
    // with `"` so it becomes an ordinary StrLit token.
    let mut left = self.lpop();
    left.content = Str::from(left.content.trim_end_matches("\\{").to_string() + "\"");
    left.kind = StrLit;
    let mut expr = Expr::Lit(Literal::from(left));
    loop {
        match self.peek() {
            Some(l) if l.is(StrInterpRight) => {
                // Tail piece `}..."`: drop the leading `}` and reopen the
                // literal with `"`, then append it and finish.
                let mut right = self.lpop();
                right.content =
                    Str::from(format!("\"{}", right.content.trim_start_matches('}')));
                right.kind = StrLit;
                let right = Expr::Lit(Literal::from(right));
                let op = Token::new(
                    Plus,
                    "+",
                    right.ln_begin().unwrap(),
                    right.col_begin().unwrap(),
                );
                expr = Expr::BinOp(BinOp::new(op, expr, right));
                self.level -= 1;
                return Ok(expr);
            }
            Some(_) => {
                // Interpolated expression: reduce it, wrap it in a call to
                // the builtin `str`, and append `str(expr)` to the chain.
                let mid_expr = self.try_reduce_expr(true, false, false, false)?;
                let str_func = Expr::local(
                    "str",
                    mid_expr.ln_begin().unwrap(),
                    mid_expr.col_begin().unwrap(),
                );
                let call = Call::new(
                    str_func,
                    None,
                    Args::new(vec![PosArg::new(mid_expr)], vec![], None),
                );
                let op = Token::new(
                    Plus,
                    "+",
                    call.ln_begin().unwrap(),
                    call.col_begin().unwrap(),
                );
                let bin = BinOp::new(op, expr, Expr::Call(call));
                expr = Expr::BinOp(bin);
                // A middle piece `}...\{` between two interpolations: strip
                // both markers, quote it, and append it as a StrLit.
                if self.cur_is(StrInterpMid) {
                    let mut mid = self.lpop();
                    mid.content = Str::from(format!(
                        "\"{}\"",
                        mid.content.trim_start_matches('}').trim_end_matches("\\{")
                    ));
                    mid.kind = StrLit;
                    let mid = Expr::Lit(Literal::from(mid));
                    let op = Token::new(
                        Plus,
                        "+",
                        mid.ln_begin().unwrap(),
                        mid.col_begin().unwrap(),
                    );
                    expr = Expr::BinOp(BinOp::new(op, expr, mid));
                }
                // Otherwise the next token should be StrInterpRight (handled
                // on the next loop iteration) or the interpolation is
                // unterminated (None arm below).
            }
            None => {
                // Ran out of tokens before StrInterpRight: report and bail.
                self.level -= 1;
                let err = ParseError::syntax_error(
                    line!() as usize,
                    expr.loc(),
                    switch_lang!(
                        "japanese" => "文字列補間の終わりが見つかりませんでした",
                        "english" => "end of string interpolation not found",
                    ),
                    None,
                );
                self.errs.push(err);
                return Err(());
            }
        }
    }
}
/// x |> f() => f(x) /// x |> f() => f(x)
fn try_reduce_stream_operator(&mut self, stack: &mut Vec<ExprOrOp>) -> ParseResult<()> { fn try_reduce_stream_operator(&mut self, stack: &mut Vec<ExprOrOp>) -> ParseResult<()> {
debug_call_info!(self); debug_call_info!(self);

View file

@ -3,5 +3,6 @@ print! "こんにちは、世界!"
print! "Γειά σου Κόσμε!" print! "Γειά σου Κόσμε!"
print! "!مرحبا بالعالم" print! "!مرحبا بالعالم"
greeting = "Hello" greetings = ["Good morning", "Hello", "Good evening"]
print! "{greeting}, world!" for! greetings, greeting =>
print! "\{greeting}, world!"