diff --git a/cli/src/repl/eval.rs b/cli/src/repl/eval.rs index 9bb8ecfea6..60110d48b1 100644 --- a/cli/src/repl/eval.rs +++ b/cli/src/repl/eval.rs @@ -6,7 +6,7 @@ use roc_module::ident::{Lowercase, TagName}; use roc_module::operator::CalledVia; use roc_module::symbol::{Interns, ModuleId, Symbol}; use roc_mono::layout::{Builtin, Layout}; -use roc_parse::ast::{AssignedField, Expr}; +use roc_parse::ast::{AssignedField, Expr, StrLiteral}; use roc_region::all::{Located, Region}; use roc_types::subs::{Content, FlatType, Subs, Variable}; use roc_types::types::RecordField; @@ -90,7 +90,7 @@ fn jit_to_ast_help<'a>( execution_engine, main_fn_name, &'static str, - |string: &'static str| { Expr::Str(env.arena.alloc(string)) } + |string: &'static str| { str_slice_to_ast(env.arena, env.arena.alloc(string)) } ), Layout::Builtin(Builtin::EmptyList) => { jit_map!(execution_engine, main_fn_name, &'static str, |_| { @@ -168,11 +168,11 @@ fn ptr_to_ast<'a>( list_to_ast(env, ptr, len, elem_layout, content) } - Layout::Builtin(Builtin::EmptyStr) => Expr::Str(""), + Layout::Builtin(Builtin::EmptyStr) => Expr::Str(StrLiteral::PlainLine("")), Layout::Builtin(Builtin::Str) => { let arena_str = unsafe { *(ptr as *const &'static str) }; - Expr::Str(arena_str) + str_slice_to_ast(env.arena, arena_str) } Layout::Struct(field_layouts) => match content { Content::Structure(FlatType::Record(fields, _)) => { @@ -405,3 +405,10 @@ fn i64_to_ast(arena: &Bump, num: i64) -> Expr<'_> { fn f64_to_ast(arena: &Bump, num: f64) -> Expr<'_> { Expr::Num(arena.alloc(format!("{}", num))) } + +fn str_slice_to_ast<'a>(_arena: &'a Bump, string: &'a str) -> Expr<'a> { + todo!( + "if this string contains newlines, render it as a multiline string: {:?}", + Expr::Str(StrLiteral::PlainLine(string)) + ); +} diff --git a/cli/tests/repl_eval.rs b/cli/tests/repl_eval.rs index d02c18f710..80e6710405 100644 --- a/cli/tests/repl_eval.rs +++ b/cli/tests/repl_eval.rs @@ -232,6 +232,12 @@ mod repl_eval { ); } + #[test] + fn multiline_string() { + // If a string contains newlines, format it as a multiline string in the output + expect_success(r#""\n\nhi!\n\n""#, "\"\"\"\n\nhi!\n\n\"\"\""); + } + // TODO uncomment this once https://github.com/rtfeldman/roc/issues/295 is done // // #[test] diff --git a/compiler/can/src/expr.rs b/compiler/can/src/expr.rs index 7582abdab5..7b76de6a9c 100644 --- a/compiler/can/src/expr.rs +++ b/compiler/can/src/expr.rs @@ -14,7 +14,7 @@ use roc_module::ident::{Lowercase, TagName}; use roc_module::low_level::LowLevel; use roc_module::operator::CalledVia; use roc_module::symbol::Symbol; -use roc_parse::ast; +use roc_parse::ast::{self, StrLiteral, StrSegment}; use roc_parse::pattern::PatternType::*; use roc_problem::can::{PrecedenceProblem, Problem, RuntimeError}; use roc_region::all::{Located, Region}; @@ -55,8 +55,10 @@ pub enum Expr { // Int and Float store a variable to generate better error messages Int(Variable, i64), Float(Variable, f64), - Str(Box), - BlockStr(Box), + Str { + interpolations: Vec<(Box, Symbol)>, + suffix: Box, + }, List { list_var: Variable, // required for uniqueness of the list elem_var: Variable, @@ -247,12 +249,7 @@ pub fn canonicalize_expr<'a>( ) } } - ast::Expr::Str(string) => (Str((*string).into()), Output::default()), - ast::Expr::BlockStr(lines) => { - let joined = lines.iter().copied().collect::>().join("\n"); - - (BlockStr(joined.into()), Output::default()) - } + ast::Expr::Str(literal) => flatten_str_literal(env, scope, literal), ast::Expr::List(loc_elems) => { if loc_elems.is_empty() { ( @@ -1045,8 +1042,7 @@ pub fn inline_calls(var_store: &mut VarStore, scope: &mut Scope, expr: Expr) -> other @ Num(_, _) | other @ Int(_, _) | other @ Float(_, _) - | other @ Str(_) - | other @ BlockStr(_) + | other @ Str { .. } | other @ RuntimeError(_) | other @ EmptyRecord | other @ Accessor { .. } @@ -1323,3 +1319,78 @@ pub fn inline_calls(var_store: &mut VarStore, scope: &mut Scope, expr: Expr) -> } } } + +fn flatten_str_literal( + env: &mut Env<'_>, + scope: &mut Scope, + literal: &StrLiteral<'_>, +) -> (Expr, Output) { + use ast::StrLiteral::*; + + match literal { + PlainLine(str_slice) => ( + Expr::Str { + interpolations: Vec::new(), + suffix: (*str_slice).into(), + }, + Output::default(), + ), + LineWithEscapes(segments) => flatten_str_lines(env, scope, &[segments]), + Block(lines) => flatten_str_lines(env, scope, lines), + } +} + +fn flatten_str_lines( + env: &mut Env<'_>, + scope: &mut Scope, + lines: &[&[StrSegment<'_>]], +) -> (Expr, Output) { + use StrSegment::*; + + let mut buf = String::new(); + let mut interpolations = Vec::new(); + let mut output = Output::default(); + + for line in lines { + for segment in line.iter() { + match segment { + Plaintext(string) => { + buf.push_str(string); + } + Unicode(loc_digits) => { + todo!("parse unicode digits {:?}", loc_digits); + } + Interpolated { + module_name, + ident, + region, + } => { + let (expr, new_output) = + canonicalize_lookup(env, scope, module_name, ident, region.clone()); + + output.union(new_output); + + match expr { + Expr::Var(symbol) => { + interpolations.push((buf.into(), symbol)); + } + _ => { + todo!("TODO gracefully handle non-ident in string interpolation."); + } + } + + buf = String::new(); + } + EscapedChar(ch) => buf.push(*ch), + } + } + } + + ( + Expr::Str { + interpolations, + suffix: buf.into(), + }, + output, + ) +} diff --git a/compiler/can/src/operator.rs b/compiler/can/src/operator.rs index 8b7e13d70e..9eead9a9eb 100644 --- a/compiler/can/src/operator.rs +++ b/compiler/can/src/operator.rs @@ -68,8 +68,6 @@ pub fn desugar_expr<'a>(arena: &'a Bump, loc_expr: &'a Located>) -> &'a | Nested(NonBase10Int { .. }) | Str(_) | Nested(Str(_)) - | BlockStr(_) - | Nested(BlockStr(_)) | AccessorFunction(_) | Nested(AccessorFunction(_)) | Var { .. } diff --git a/compiler/can/src/pattern.rs b/compiler/can/src/pattern.rs index eef2b0532e..9bebbeed38 100644 --- a/compiler/can/src/pattern.rs +++ b/compiler/can/src/pattern.rs @@ -4,7 +4,7 @@ use crate::num::{finish_parsing_base, finish_parsing_float, finish_parsing_int}; use crate::scope::Scope; use roc_module::ident::{Ident, Lowercase, TagName}; use roc_module::symbol::Symbol; -use roc_parse::ast; +use roc_parse::ast::{self, StrLiteral, StrSegment}; use roc_parse::pattern::PatternType; use roc_problem::can::{MalformedPatternProblem, Problem, RuntimeError}; use roc_region::all::{Located, Region}; @@ -230,16 +230,8 @@ pub fn canonicalize_pattern<'a>( ptype => unsupported_pattern(env, ptype, region), }, - StrLiteral(string) => match pattern_type { - WhenBranch => { - // TODO report whether string was malformed - Pattern::StrLiteral((*string).into()) - } - ptype => unsupported_pattern(env, ptype, region), - }, - - BlockStrLiteral(_lines) => match pattern_type { - WhenBranch => todo!("TODO block string literal pattern"), + StrLiteral(literal) => match pattern_type { + WhenBranch => flatten_str_literal(literal), ptype => unsupported_pattern(env, ptype, region), }, @@ -473,3 +465,38 @@ fn add_bindings_from_patterns( | UnsupportedPattern(_) => (), } } + +fn flatten_str_literal(literal: &StrLiteral<'_>) -> Pattern { + use ast::StrLiteral::*; + + match literal { + PlainLine(str_slice) => Pattern::StrLiteral((*str_slice).into()), + LineWithEscapes(segments) => flatten_str_lines(&[segments]), + Block(lines) => flatten_str_lines(lines), + } +} + +fn flatten_str_lines(lines: &[&[StrSegment<'_>]]) -> Pattern { + use StrSegment::*; + + let mut buf = String::new(); + + for line in lines { + for segment in line.iter() { + match segment { + Plaintext(string) => { + buf.push_str(string); + } + Unicode(loc_digits) => { + todo!("parse unicode digits {:?}", loc_digits); + } + Interpolated { region, .. } => { + return Pattern::UnsupportedPattern(region.clone()); + } + EscapedChar(ch) => buf.push(*ch), + } + } + } + + Pattern::StrLiteral(buf.into()) +} diff --git a/compiler/can/tests/test_can.rs b/compiler/can/tests/test_can.rs index 2b6e4ce908..4dec7f0f77 100644 --- a/compiler/can/tests/test_can.rs +++ b/compiler/can/tests/test_can.rs @@ -1236,104 +1236,112 @@ mod test_can { // ); // } - // #[test] - // fn string_with_interpolation_at_start() { - // let input = indoc!( - // r#" - // "\(abc)defg" - // "# - // ); - // let (args, ret) = (vec![("", Located::new(0, 2, 0, 4, Var("abc")))], "defg"); - // let arena = Bump::new(); - // let actual = parse_with(&arena, input); + #[test] + fn string_with_interpolation_at_start() { + let src = indoc!( + r#" + "\(abc)defg" + "# + ); + let arena = Bump::new(); + let CanExprOut { + loc_expr, problems, .. + } = can_expr_with(&arena, test_home(), src); + assert_eq!(problems, Vec::new()); + // let (args, ret) = (vec![("", Located::new(0, 2, 0, 4, Var("abc")))], "defg"); + // let arena = Bump::new(); + // let actual = parse_with(&arena, input); - // assert_eq!( - // Ok(InterpolatedStr(&(arena.alloc_slice_clone(&args), ret))), - // actual - // ); - // } + // assert_eq!( + // Ok(Expr::InterpolatedStr(&( + // arena.alloc_slice_clone(&args), + // ret + // ))), + // actual + // ); + } - // #[test] - // fn string_with_interpolation_at_end() { - // let input = indoc!( - // r#" - // "abcd\(efg)" - // "# - // ); - // let (args, ret) = (vec![("abcd", Located::new(0, 6, 0, 8, Var("efg")))], ""); - // let arena = Bump::new(); - // let actual = parse_with(&arena, input); + #[test] + fn string_with_interpolation_at_end() { + let src = indoc!( + r#" + "abcd\(efg)" + "# + ); + // let (args, ret) = (vec![("abcd", Located::new(0, 6, 0, 8, Var("efg")))], ""); + // let arena = Bump::new(); + // let actual = parse_with(&arena, input); - // assert_eq!( - // Ok(InterpolatedStr(&(arena.alloc_slice_clone(&args), ret))), - // actual - // ); - // } + // assert_eq!( + // Ok(InterpolatedStr(&(arena.alloc_slice_clone(&args), ret))), + // actual + // ); + } - // #[test] - // fn string_with_interpolation_in_middle() { - // let input = indoc!( - // r#" - // "abc\(defg)hij" - // "# - // ); - // let (args, ret) = (vec![("abc", Located::new(0, 5, 0, 8, Var("defg")))], "hij"); - // let arena = Bump::new(); - // let actual = parse_with(&arena, input); + #[test] + fn string_with_interpolation_in_middle() { + let src = indoc!( + r#" + "abc\(defg)hij" + "# + ); + // let (args, ret) = (vec![("abc", Located::new(0, 5, 0, 8, Var("defg")))], "hij"); + // let arena = Bump::new(); + // let actual = parse_with(&arena, input); - // assert_eq!( - // Ok(InterpolatedStr(&(arena.alloc_slice_clone(&args), ret))), - // actual - // ); - // } + // assert_eq!( + // Ok(InterpolatedStr(&(arena.alloc_slice_clone(&args), ret))), + // actual + // ); + } - // #[test] - // fn string_with_two_interpolations_in_middle() { - // let input = indoc!( - // r#" - // "abc\(defg)hi\(jkl)mn" - // "# - // ); - // let (args, ret) = ( - // vec![ - // ("abc", Located::new(0, 5, 0, 8, Var("defg"))), - // ("hi", Located::new(0, 14, 0, 16, Var("jkl"))), - // ], - // "mn", - // ); - // let arena = Bump::new(); - // let actual = parse_with(&arena, input); + #[test] + fn string_with_two_interpolations_in_middle() { + let src = indoc!( + r#" + "abc\(defg)hi\(jkl)mn" + "# + ); + // let (args, ret) = ( + // vec![ + // ("abc", Located::new(0, 5, 0, 8, Var("defg"))), + // ("hi", Located::new(0, 14, 0, 16, Var("jkl"))), + // ], + // "mn", + // ); + // let arena = Bump::new(); + // let actual = parse_with(&arena, input); - // assert_eq!( - // Ok(InterpolatedStr(&(arena.alloc_slice_clone(&args), ret))), - // actual - // ); - // } + // assert_eq!( + // Ok(InterpolatedStr(&(arena.alloc_slice_clone(&args), ret))), + // actual + // ); + } - // #[test] - // fn string_with_four_interpolations() { - // let input = indoc!( - // r#" - // "\(abc)def\(ghi)jkl\(mno)pqrs\(tuv)" - // "# - // ); - // let (args, ret) = ( - // vec![ - // ("", Located::new(0, 2, 0, 4, Var("abc"))), - // ("def", Located::new(0, 11, 0, 13, Var("ghi"))), - // ("jkl", Located::new(0, 20, 0, 22, Var("mno"))), - // ("pqrs", Located::new(0, 30, 0, 32, Var("tuv"))), - // ], - // "", - // ); - // let arena = Bump::new(); - // let actual = parse_with(&arena, input); + #[test] + fn string_with_four_interpolations() { + let src = indoc!( + r#" + "\(abc)def\(ghi)jkl\(mno)pqrs\(tuv)" + "# + ); + // let (args, ret) = ( + // vec![ + // ("", Located::new(0, 2, 0, 4, Var("abc"))), + // ("def", Located::new(0, 11, 0, 13, Var("ghi"))), + // ("jkl", Located::new(0, 20, 0, 22, Var("mno"))), + // ("pqrs", Located::new(0, 30, 0, 32, Var("tuv"))), + // ], + // "", + // ); + // let arena = Bump::new(); + // let actual = parse_with(&arena, input); - // assert_eq!( - // Ok(InterpolatedStr(&(arena.alloc_slice_clone(&args), ret))), - // actual - // ); - // } + // assert_eq!( + // Ok(InterpolatedStr(&(arena.alloc_slice_clone(&args), ret))), + // actual + // ); + } // #[test] // fn string_with_escaped_interpolation() { @@ -1384,4 +1392,6 @@ mod test_can { // TODO test hex/oct/binary conversion to numbers // // TODO test for \t \r and \n in string literals *outside* unicode escape sequence! + // + // TODO test for multiline block string literals in pattern matches } diff --git a/compiler/constrain/src/expr.rs b/compiler/constrain/src/expr.rs index f7746c676f..dc4acbdc05 100644 --- a/compiler/constrain/src/expr.rs +++ b/compiler/constrain/src/expr.rs @@ -1,4 +1,4 @@ -use crate::builtins::{empty_list_type, float_literal, int_literal, list_type, str_type}; +use crate::builtins::{empty_list_type, float_literal, int_literal, list_type}; use crate::pattern::{constrain_pattern, PatternState}; use roc_can::annotation::IntroducedVariables; use roc_can::constraint::Constraint::{self, *}; @@ -199,7 +199,15 @@ pub fn constrain_expr( exists(vars, And(cons)) } - Str(_) | BlockStr(_) => Eq(str_type(), expected, Category::Str, region), + Str { interpolations, .. } => { + todo!( + "constrain interpolations in a string literal {:?}", + interpolations + ); + + // use crate::builtins::{empty_list_type, float_literal, int_literal, list_type, str_type}; + // Eq(str_type(), expected, Category::Str, region) + } List { elem_var, loc_elems, diff --git a/compiler/constrain/src/uniq.rs b/compiler/constrain/src/uniq.rs index 3728b7eb35..ec2ca083b3 100644 --- a/compiler/constrain/src/uniq.rs +++ b/compiler/constrain/src/uniq.rs @@ -503,14 +503,15 @@ pub fn constrain_expr( ]), ) } - BlockStr(_) | Str(_) => { - let uniq_type = var_store.fresh(); - let inferred = str_type(Bool::variable(uniq_type)); + Str { interpolations, .. } => { + todo!("uniq constrain interpolations {:?}", interpolations); + // let uniq_type = var_store.fresh(); + // let inferred = str_type(Bool::variable(uniq_type)); - exists( - vec![uniq_type], - Eq(inferred, expected, Category::Str, region), - ) + // exists( + // vec![uniq_type], + // Eq(inferred, expected, Category::Str, region), + // ) } EmptyRecord => { let uniq_type = var_store.fresh(); diff --git a/compiler/fmt/src/expr.rs b/compiler/fmt/src/expr.rs index 700d00acbe..d0035a4167 100644 --- a/compiler/fmt/src/expr.rs +++ b/compiler/fmt/src/expr.rs @@ -28,7 +28,6 @@ impl<'a> Formattable<'a> for Expr<'a> { Float(_) | Num(_) | NonBase10Int { .. } - | Str(_) | Access(_, _) | AccessorFunction(_) | Var { .. } @@ -42,7 +41,13 @@ impl<'a> Formattable<'a> for Expr<'a> { List(elems) => elems.iter().any(|loc_expr| loc_expr.is_multiline()), - BlockStr(lines) => lines.len() > 1, + Str(literal) => { + todo!( + "fmt determine if string literal is multiline: {:?}", + literal + ); + // lines.len() > 1 + } Apply(loc_expr, args, _) => { loc_expr.is_multiline() || args.iter().any(|loc_arg| loc_arg.is_multiline()) } @@ -112,10 +117,19 @@ impl<'a> Formattable<'a> for Expr<'a> { sub_expr.format_with_options(buf, Parens::NotNeeded, Newlines::Yes, indent); buf.push(')'); } - Str(string) => { - buf.push('"'); - buf.push_str(string); - buf.push('"'); + Str(literal) => { + todo!("fmt string literal {:?}", literal); + // buf.push('"'); + // buf.push_str(string); + // buf.push('"'); + // + // BlockStr(lines) => { + // buf.push_str("\"\"\""); + // for line in lines.iter() { + // buf.push_str(line); + // } + // buf.push_str("\"\"\""); + // } } Var { module_name, ident } => { if !module_name.is_empty() { @@ -152,13 +166,6 @@ impl<'a> Formattable<'a> for Expr<'a> { buf.push(')'); } } - BlockStr(lines) => { - buf.push_str("\"\"\""); - for line in lines.iter() { - buf.push_str(line); - } - buf.push_str("\"\"\""); - } Num(string) | Float(string) | GlobalTag(string) | PrivateTag(string) => { buf.push_str(string) } diff --git a/compiler/fmt/src/pattern.rs b/compiler/fmt/src/pattern.rs index 361a4ac9df..af087b8a3d 100644 --- a/compiler/fmt/src/pattern.rs +++ b/compiler/fmt/src/pattern.rs @@ -37,7 +37,6 @@ impl<'a> Formattable<'a> for Pattern<'a> { | Pattern::NonBase10Literal { .. } | Pattern::FloatLiteral(_) | Pattern::StrLiteral(_) - | Pattern::BlockStrLiteral(_) | Pattern::Underscore | Pattern::Malformed(_) | Pattern::QualifiedIdentifier { .. } => false, @@ -126,11 +125,8 @@ impl<'a> Formattable<'a> for Pattern<'a> { buf.push_str(string); } FloatLiteral(string) => buf.push_str(string), - StrLiteral(string) => buf.push_str(string), - BlockStrLiteral(lines) => { - for line in *lines { - buf.push_str(line) - } + StrLiteral(literal) => { + todo!("Format string literal: {:?}", literal); } Underscore => buf.push('_'), diff --git a/compiler/gen/src/llvm/build.rs b/compiler/gen/src/llvm/build.rs index 34f512406a..5997ce7c29 100644 --- a/compiler/gen/src/llvm/build.rs +++ b/compiler/gen/src/llvm/build.rs @@ -222,10 +222,22 @@ pub fn build_exp_literal<'a, 'ctx, 'env>( Float(num) => env.context.f64_type().const_float(*num).into(), Bool(b) => env.context.bool_type().const_int(*b as u64, false).into(), Byte(b) => env.context.i8_type().const_int(*b as u64, false).into(), - Str(str_literal) => { - if str_literal.is_empty() { + Str { + interpolations, + suffix, + } => { + if interpolations.is_empty() && suffix.is_empty() { empty_list(env) } else { + if !interpolations.is_empty() { + todo!( + "LLVM code gen for string interpolations: {:?}", + interpolations + ); + } + + let mut str_literal = suffix; // TODO REMOVE THIS + let ctx = env.context; let builder = env.builder; diff --git a/compiler/mono/src/decision_tree.rs b/compiler/mono/src/decision_tree.rs index a3f637efb7..83eca10e2a 100644 --- a/compiler/mono/src/decision_tree.rs +++ b/compiler/mono/src/decision_tree.rs @@ -1095,7 +1095,10 @@ fn test_to_equality<'a>( } Test::IsStr(test_str) => { - let lhs = Expr::Literal(Literal::Str(env.arena.alloc(test_str))); + let lhs = Expr::Literal(Literal::Str { + interpolations: &[], + suffix: env.arena.alloc(test_str), + }); let lhs_symbol = env.unique_symbol(); let (mut stores, rhs_symbol) = path_to_expr(env, cond_symbol, &path, &cond_layout); diff --git a/compiler/mono/src/ir.rs b/compiler/mono/src/ir.rs index 3df0b1334b..5f94a79168 100644 --- a/compiler/mono/src/ir.rs +++ b/compiler/mono/src/ir.rs @@ -590,7 +590,10 @@ pub enum Literal<'a> { // Literals Int(i64), Float(f64), - Str(&'a str), + Str { + interpolations: &'a [(&'a str, Symbol)], + suffix: &'a str, + }, /// Closed tag unions containing exactly two (0-arity) tags compile to Expr::Bool, /// so they can (at least potentially) be emitted as 1-bit machine bools. /// @@ -669,7 +672,13 @@ impl<'a> Literal<'a> { Float(lit) => alloc.text(format!("{}f64", lit)), Bool(lit) => alloc.text(format!("{}", lit)), Byte(lit) => alloc.text(format!("{}u8", lit)), - Str(lit) => alloc.text(format!("{:?}", lit)), + Str { + interpolations, + suffix, + } => { + // alloc.text(format!("{:?}", lit)) + todo!("Literal::to_doc for Str"); + } } } } @@ -1242,12 +1251,18 @@ pub fn with_hole<'a>( hole, ), - Str(string) | BlockStr(string) => Stmt::Let( - assigned, - Expr::Literal(Literal::Str(arena.alloc(string))), - Layout::Builtin(Builtin::Str), - hole, - ), + Str { + interpolations, + suffix: _, + } => { + todo!("mono IR to turn Str interpolations into Let"); + // Stmt::Let( + // assigned, + // Expr::Literal(Literal::Str(arena.alloc(string))), + // Layout::Builtin(Builtin::Str), + // hole, + // ) + } Num(var, num) => match num_argument_to_int_or_float(env.subs, var) { IntOrFloat::IntType => Stmt::Let( diff --git a/compiler/parse/src/ast.rs b/compiler/parse/src/ast.rs index 681a214e5e..cf4854a575 100644 --- a/compiler/parse/src/ast.rs +++ b/compiler/parse/src/ast.rs @@ -84,6 +84,26 @@ pub struct WhenPattern<'a> { pub guard: Option>>, } +#[derive(Clone, Debug, PartialEq)] +pub enum StrSegment<'a> { + Plaintext(&'a str), // e.g. "foo" + Unicode(Loc<&'a str>), // e.g. "00A0" in "\u(00A0)" + EscapedChar(char), // e.g. '\n' in "Hello!\n" + Interpolated { + // e.g. "App.version" in "Version: \(App.version)" + module_name: &'a str, + ident: &'a str, + region: Region, + }, +} + +#[derive(Clone, Debug, PartialEq)] +pub enum StrLiteral<'a> { + PlainLine(&'a str), + LineWithEscapes(&'a [StrSegment<'a>]), + Block(&'a [&'a [StrSegment<'a>]]), +} + /// A parsed expression. This uses lifetimes extensively for two reasons: /// /// 1. It uses Bump::alloc for all allocations, which returns a reference. @@ -105,8 +125,7 @@ pub enum Expr<'a> { }, // String Literals - Str(&'a str), - BlockStr(&'a [&'a str]), + Str(StrLiteral<'a>), // string without escapes in it /// Look up exactly one field on a record, e.g. (expr).foo. Access(&'a Expr<'a>, &'a str), /// e.g. `.foo` @@ -336,8 +355,7 @@ pub enum Pattern<'a> { is_negative: bool, }, FloatLiteral(&'a str), - StrLiteral(&'a str), - BlockStrLiteral(&'a [&'a str]), + StrLiteral(StrLiteral<'a>), Underscore, // Space @@ -455,7 +473,6 @@ impl<'a> Pattern<'a> { ) => string_x == string_y && base_x == base_y && is_negative_x == is_negative_y, (FloatLiteral(x), FloatLiteral(y)) => x == y, (StrLiteral(x), StrLiteral(y)) => x == y, - (BlockStrLiteral(x), BlockStrLiteral(y)) => x == y, (Underscore, Underscore) => true, // Space @@ -584,7 +601,7 @@ impl<'a> Spaceable<'a> for Def<'a> { pub enum Attempting { List, Keyword, - StringLiteral, + StrLiteral, RecordLiteral, RecordFieldLabel, InterpolatedString, diff --git a/compiler/parse/src/expr.rs b/compiler/parse/src/expr.rs index c701652661..e3baa3d01c 100644 --- a/compiler/parse/src/expr.rs +++ b/compiler/parse/src/expr.rs @@ -300,12 +300,8 @@ fn expr_to_pattern<'a>(arena: &'a Bump, expr: &Expr<'a>) -> Result, base: *base, is_negative: *is_negative, }), - Expr::Str(string) => Ok(Pattern::StrLiteral(string)), - Expr::MalformedIdent(string) => Ok(Pattern::Malformed(string)), - // These would not have parsed as patterns - Expr::BlockStr(_) - | Expr::AccessorFunction(_) + Expr::AccessorFunction(_) | Expr::Access(_, _) | Expr::List(_) | Expr::Closure(_, _) @@ -322,6 +318,9 @@ fn expr_to_pattern<'a>(arena: &'a Bump, expr: &Expr<'a>) -> Result, attempting: Attempting::Def, reason: FailReason::InvalidPattern, }), + + Expr::Str(string) => Ok(Pattern::StrLiteral(string.clone())), + Expr::MalformedIdent(string) => Ok(Pattern::Malformed(string)), } } @@ -580,11 +579,7 @@ fn annotation_or_alias<'a>( QualifiedIdentifier { .. } => { panic!("TODO gracefully handle trying to annotate a qualified identifier, e.g. `Foo.bar : ...`"); } - NumLiteral(_) - | NonBase10Literal { .. } - | FloatLiteral(_) - | StrLiteral(_) - | BlockStrLiteral(_) => { + NumLiteral(_) | NonBase10Literal { .. } | FloatLiteral(_) | StrLiteral(_) => { panic!("TODO gracefully handle trying to annotate a litera"); } Underscore => { @@ -916,10 +911,7 @@ fn number_pattern<'a>() -> impl Parser<'a, Pattern<'a>> { } fn string_pattern<'a>() -> impl Parser<'a, Pattern<'a>> { - map!(crate::string_literal::parse(), |result| match result { - crate::string_literal::StringLiteral::Line(string) => Pattern::StrLiteral(string), - crate::string_literal::StringLiteral::Block(lines) => Pattern::BlockStrLiteral(lines), - }) + map!(crate::string_literal::parse(), Pattern::StrLiteral) } fn underscore_pattern<'a>() -> impl Parser<'a, Pattern<'a>> { @@ -1789,8 +1781,5 @@ pub fn global_tag<'a>() -> impl Parser<'a, &'a str> { } pub fn string_literal<'a>() -> impl Parser<'a, Expr<'a>> { - map!(crate::string_literal::parse(), |result| match result { - crate::string_literal::StringLiteral::Line(string) => Expr::Str(string), - crate::string_literal::StringLiteral::Block(lines) => Expr::BlockStr(lines), - }) + map!(crate::string_literal::parse(), Expr::Str) } diff --git a/compiler/parse/src/string_literal.rs b/compiler/parse/src/string_literal.rs index 882c6295f9..eb72cd5d24 100644 --- a/compiler/parse/src/string_literal.rs +++ b/compiler/parse/src/string_literal.rs @@ -1,71 +1,110 @@ -use crate::ast::Attempting; +use crate::ast::{Attempting, StrLiteral, StrSegment}; use crate::parser::{parse_utf8, unexpected, unexpected_eof, ParseResult, Parser, State}; use bumpalo::collections::vec::Vec; use bumpalo::Bump; -pub enum StringLiteral<'a> { - Line(&'a str), - Block(&'a [&'a str]), -} +pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> { + use StrLiteral::*; -pub fn parse<'a>() -> impl Parser<'a, StringLiteral<'a>> { move |arena: &'a Bump, state: State<'a>| { let mut bytes = state.bytes.iter(); - // String literals must start with a quote. // If this doesn't, it must not be a string literal! match bytes.next() { Some(&byte) => { if byte != b'"' { - return Err(unexpected(0, state, Attempting::StringLiteral)); + return Err(unexpected(0, state, Attempting::StrLiteral)); } } None => { - return Err(unexpected_eof(0, Attempting::StringLiteral, state)); + return Err(unexpected_eof(0, Attempting::StrLiteral, state)); } } + // The current segment begins right after the opening quotation mark. + let mut cur_segment = &state.bytes[1..]; + + enum EscapeState { + None, + Unicode, + Interpolation, + } + // At the parsing stage we keep the entire raw string, because the formatter // needs the raw string. (For example, so it can "remember" whether you // wrote \u{...} or the actual unicode character itself.) // - // Later, in canonicalization, we'll do things like resolving - // unicode escapes and string interpolation. - // // Since we're keeping the entire raw string, all we need to track is // how many characters we've parsed. So far, that's 1 (the opening `"`). - let mut parsed_chars = 1; - let mut prev_byte = b'"'; + let mut total_parsed_chars = 1; + let mut segment_parsed_chars = 0; + let mut segments = Vec::new_in(arena); + let mut escape_state = EscapeState::None; + + // pub enum StrSegment<'a> { + // Plaintext(&'a str), // e.g. "foo" + // Unicode(&'a str), // e.g. "00A0" in "\u(00A0)" + // Interpolated(&'a str), // e.g. "name" in "Hi, \(name)!" + // EscapedChar(char), // e.g. '\n' in "Hello!\n" + // } while let Some(&byte) = bytes.next() { - parsed_chars += 1; + segment_parsed_chars += 1; // Potentially end the string (unless this is an escaped `"`!) match byte { - b'"' if prev_byte != b'\\' => { - let (string, state) = if parsed_chars == 2 { - match bytes.next() { - Some(b'"') => { - // If the first three chars were all `"`, then this - // literal begins with `"""` and is a block string. - return parse_block_string(arena, state, &mut bytes); + b'"' => { + // If we aren't escaping, then this is the end of the string! + if let EscapeState::None = escape_state { + let (literal, state) = if total_parsed_chars == 1 && segments.is_empty() { + match bytes.next() { + Some(b'"') => { + // If the very first three chars were all `"`, + // then this literal begins with `"""` + // and is a block string. + return parse_block_string(arena, state, &mut bytes); + } + _ => (PlainLine(""), state.advance_without_indenting(2)?), } - _ => ("", state.advance_without_indenting(2)?), - } + } else { + // Subtract 1 from parsed_chars so we omit the closing `"`. + let string_bytes = &cur_segment[0..(segment_parsed_chars - 1)]; + + match parse_utf8(string_bytes) { + Ok(string) => { + total_parsed_chars += segment_parsed_chars; + + let state = + state.advance_without_indenting(total_parsed_chars)?; + + if segments.is_empty() { + // We only had one segment. + (StrLiteral::PlainLine(string), state) + } else { + // We had multiple segments! Parse the + // current one and add it to the list. + segments.push(StrSegment::Plaintext(string)); + + (LineWithEscapes(segments.into_bump_slice()), state) + } + } + Err(reason) => { + return state.fail(reason); + } + } + }; + + return Ok((literal, state)); } else { - // Start at 1 so we omit the opening `"`. - // Subtract 1 from parsed_chars so we omit the closing `"`. - let string_bytes = &state.bytes[1..(parsed_chars - 1)]; - - match parse_utf8(string_bytes) { - Ok(string) => (string, state.advance_without_indenting(parsed_chars)?), - Err(reason) => { - return state.fail(reason); - } - } - }; - - return Ok((StringLiteral::Line(string), state)); + // We are escaping, so this is an error. (If it were an + // escaped single character like \" then we would have + // handled that scenario already.) + return Err(unexpected( + state.bytes.len() - 1, + state, + Attempting::StrLiteral, + )); + } } b'\n' => { // This is a single-line string, which cannot have newlines! @@ -76,19 +115,90 @@ pub fn parse<'a>() -> impl Parser<'a, StringLiteral<'a>> { return Err(unexpected( state.bytes.len() - 1, state, - Attempting::StringLiteral, + Attempting::StrLiteral, )); } + b')' => { + // All escape sequences end in a close paren, so we don't + // need to pay for a conditional here. If it was an escape, + // then we want to set it to None, and if it wasn't an + // escape, then setting it from None to None is harmless! + // (And likely cheaper than a conditional.) + escape_state = EscapeState::None; + } + b'\\' => { + // This is the start of a new escape + if let EscapeState::None = escape_state { + match bytes.next() { + Some(b'(') => { + // This is an interpolated variable + escape_state = EscapeState::Interpolation; + todo!("Parse interpolated ident"); + } + Some(b'u') => { + escape_state = EscapeState::Unicode; + // This is an escaped unicode character + todo!("Parse '(' and then parse escaped unicode character"); + } + Some(ch @ b'\n') | Some(ch @ b'\t') | Some(ch @ b'\r') + | Some(ch @ b'"') | Some(ch @ b'\\') => { + // Record the current segment so we can begin a new one. + match parse_utf8(cur_segment) { + Ok(string) => { + segments.push(StrSegment::Plaintext(string)); + } + Err(reason) => { + return state.fail(reason); + } + } + + // Record the escaped char. + segments.push(StrSegment::EscapedChar(*ch as char)); + + // We're now done escaping. + escape_state = EscapeState::None; + + // Advance past the segment we just added, and + // also past the escaped char we just added. + // + // +2 because we just parsed a backslash and + // one other char after it. + cur_segment = &cur_segment[(segment_parsed_chars + 2)..]; + + // Reset segment_parsed_chars to 0 because we're now + // parsing the beginning of a new segment. + segment_parsed_chars = 0; + } + _ => { + // Invalid escape! A backslash must be followed + // by either an open paren or else one of the + // escapable characters (\n, \t, \", \\, etc) + return Err(unexpected( + state.bytes.len() - 1, + state, + Attempting::StrLiteral, + )); + } + } + } else { + // Can't have a \ inside an escape! + return Err(unexpected( + state.bytes.len() - 1, + state, + Attempting::StrLiteral, + )); + } + } _ => { - prev_byte = byte; + // All other characters need no special handling. } } } // We ran out of characters before finding a closed quote Err(unexpected_eof( - parsed_chars, - Attempting::StringLiteral, + total_parsed_chars, + Attempting::StrLiteral, state.clone(), )) } @@ -98,7 +208,7 @@ fn parse_block_string<'a, I>( arena: &'a Bump, state: State<'a>, bytes: &mut I, -) -> ParseResult<'a, StringLiteral<'a>> +) -> ParseResult<'a, StrLiteral<'a>> where I: Iterator, { @@ -125,12 +235,13 @@ where let line_bytes = &state.bytes[line_start..(parsed_chars - 3)]; return match parse_utf8(line_bytes) { - Ok(line) => { - let state = state.advance_without_indenting(parsed_chars)?; + Ok(_line) => { + // let state = state.advance_without_indenting(parsed_chars)?; - lines.push(line); + // lines.push(line); - Ok((StringLiteral::Block(arena.alloc(lines)), state)) + // Ok((StrLiteral::Block(lines.into_bump_slice()), state)) + todo!("TODO finish making block strings accept escapes"); } Err(reason) => state.fail(reason), }; @@ -164,8 +275,8 @@ where // We ran out of characters before finding 3 closing quotes Err(unexpected_eof( parsed_chars, - // TODO custom BlockStringLiteral? - Attempting::StringLiteral, + // TODO custom BlockStrLiteral? + Attempting::StrLiteral, state, )) } diff --git a/compiler/parse/tests/test_parse.rs b/compiler/parse/tests/test_parse.rs index cb97c42bc0..6478c24bef 100644 --- a/compiler/parse/tests/test_parse.rs +++ b/compiler/parse/tests/test_parse.rs @@ -24,6 +24,7 @@ mod test_parse { use roc_parse::ast::CommentOrNewline::*; use roc_parse::ast::Expr::{self, *}; use roc_parse::ast::Pattern::{self, *}; + use roc_parse::ast::StrLiteral::*; use roc_parse::ast::{ Attempting, Def, InterfaceHeader, Spaceable, Tag, TypeAnnotation, WhenBranch, }; @@ -51,7 +52,7 @@ mod test_parse { // STRING LITERALS fn expect_parsed_str(input: &str, expected: &str) { - assert_parses_to(expected, Str(input.into())); + assert_parses_to(expected, Expr::Str(PlainLine(input))); } #[test] @@ -62,7 +63,7 @@ mod test_parse { "" "# ), - Str(""), + Str(PlainLine("")), ); } @@ -71,10 +72,10 @@ mod test_parse { assert_parses_to( indoc!( r#" - "x" + "x" "# ), - Str("x".into()), + Expr::Str(PlainLine("x".into())), ); } @@ -83,10 +84,10 @@ mod test_parse { assert_parses_to( indoc!( r#" - "foo" + "foo" "# ), - Str("foo".into()), + Expr::Str(PlainLine("foo".into())), ); } @@ -1859,8 +1860,10 @@ mod test_parse { fn two_branch_when() { let arena = Bump::new(); let newlines = bumpalo::vec![in &arena; Newline]; - let pattern1 = - Pattern::SpaceBefore(arena.alloc(StrLiteral("blah")), newlines.into_bump_slice()); + let pattern1 = Pattern::SpaceBefore( + arena.alloc(StrLiteral(PlainLine("blah"))), + newlines.into_bump_slice(), + ); let loc_pattern1 = Located::new(1, 1, 1, 7, pattern1); let expr1 = Num("1"); let loc_expr1 = Located::new(1, 1, 11, 12, expr1); @@ -1870,8 +1873,10 @@ mod test_parse { guard: None, }); let newlines = bumpalo::vec![in &arena; Newline]; - let pattern2 = - Pattern::SpaceBefore(arena.alloc(StrLiteral("mise")), newlines.into_bump_slice()); + let pattern2 = Pattern::SpaceBefore( + arena.alloc(StrLiteral(PlainLine("mise"))), + newlines.into_bump_slice(), + ); let loc_pattern2 = Located::new(2, 2, 1, 7, pattern2); let expr2 = Num("2"); let loc_expr2 = Located::new(2, 2, 11, 12, expr2); @@ -1891,9 +1896,9 @@ mod test_parse { &arena, indoc!( r#" - when x is - "blah" -> 1 - "mise" -> 2 + when x is + "blah" -> 1 + "mise" -> 2 "# ), ); @@ -2003,9 +2008,11 @@ mod test_parse { fn when_with_alternative_patterns() { let arena = Bump::new(); let newlines = bumpalo::vec![in &arena; Newline]; - let pattern1 = - Pattern::SpaceBefore(arena.alloc(StrLiteral("blah")), newlines.into_bump_slice()); - let pattern1_alt = StrLiteral("blop"); + let pattern1 = Pattern::SpaceBefore( + arena.alloc(StrLiteral(PlainLine("blah"))), + newlines.into_bump_slice(), + ); + let pattern1_alt = StrLiteral(PlainLine("blop")); let loc_pattern1 = Located::new(1, 1, 1, 7, pattern1); let loc_pattern1_alt = Located::new(1, 1, 10, 16, pattern1_alt); let expr1 = Num("1"); @@ -2016,11 +2023,15 @@ mod test_parse { guard: None, }); let newlines = bumpalo::vec![in &arena; Newline]; - let pattern2 = - Pattern::SpaceBefore(arena.alloc(StrLiteral("foo")), newlines.into_bump_slice()); + let pattern2 = Pattern::SpaceBefore( + arena.alloc(StrLiteral(PlainLine("foo"))), + newlines.into_bump_slice(), + ); let newlines = bumpalo::vec![in &arena; Newline]; - let pattern2_alt = - Pattern::SpaceBefore(arena.alloc(StrLiteral("bar")), newlines.into_bump_slice()); + let pattern2_alt = Pattern::SpaceBefore( + arena.alloc(StrLiteral(PlainLine("bar"))), + newlines.into_bump_slice(), + ); let loc_pattern2 = Located::new(2, 2, 1, 6, pattern2); let loc_pattern2_alt = Located::new(3, 3, 1, 6, pattern2_alt); let expr2 = Num("2"); @@ -2133,14 +2144,14 @@ mod test_parse { let def2 = SpaceAfter( arena.alloc(Body( arena.alloc(Located::new(2, 2, 0, 3, pattern2)), - arena.alloc(Located::new(2, 2, 6, 10, Str("hi"))), + arena.alloc(Located::new(2, 2, 6, 10, Str(PlainLine("hi")))), )), newlines2.into_bump_slice(), ); let def3 = SpaceAfter( arena.alloc(Body( arena.alloc(Located::new(3, 3, 0, 3, pattern3)), - arena.alloc(Located::new(3, 3, 6, 13, Str("stuff"))), + arena.alloc(Located::new(3, 3, 6, 13, Str(PlainLine("stuff")))), )), newlines3.into_bump_slice(), ); @@ -2426,7 +2437,7 @@ mod test_parse { // ) // "# // ), - // Str(""), + // Str(PlainLine("")), // ); // } diff --git a/compiler/uniq/src/sharing.rs b/compiler/uniq/src/sharing.rs index 7caa549f91..61a7569453 100644 --- a/compiler/uniq/src/sharing.rs +++ b/compiler/uniq/src/sharing.rs @@ -787,8 +787,7 @@ pub fn annotate_usage(expr: &Expr, usage: &mut VarUsage) { | Num(_, _) | Int(_, _) | Float(_, _) - | Str(_) - | BlockStr(_) + | Str { .. } | EmptyRecord | Accessor { .. } | RunLowLevel { .. } => {}