diff --git a/crates/compiler/parse/src/parser.rs b/crates/compiler/parse/src/parser.rs index aef8cb8954..e0f53fcc1e 100644 --- a/crates/compiler/parse/src/parser.rs +++ b/crates/compiler/parse/src/parser.rs @@ -384,6 +384,7 @@ pub enum EString<'a> { UnknownEscape(Position), Format(&'a EExpr<'a>, Position), FormatEnd(Position), + MultilineInsufficientIndent(Position), } #[derive(Debug, Clone, PartialEq, Eq)] diff --git a/crates/compiler/parse/src/state.rs b/crates/compiler/parse/src/state.rs index 617909bf7d..ec111fb664 100644 --- a/crates/compiler/parse/src/state.rs +++ b/crates/compiler/parse/src/state.rs @@ -37,6 +37,25 @@ impl<'a> State<'a> { self.pos().offset - self.line_start.offset } + /// Mutably advance the state by a given offset + #[inline(always)] + pub(crate) fn advance_mut(&mut self, offset: usize) { + self.offset += offset; + } + + /// If the next `text.len()` bytes of the input match the provided `text`, + /// mutably advance the state by that much. + #[inline(always)] + pub(crate) fn consume_mut(&mut self, text: &str) -> bool { + let found = self.bytes().starts_with(text.as_bytes()); + + if found { + self.advance_mut(text.len()); + } + + found + } + #[must_use] #[inline(always)] pub(crate) const fn advance(mut self, offset: usize) -> State<'a> { diff --git a/crates/compiler/parse/src/string_literal.rs b/crates/compiler/parse/src/string_literal.rs index 4d77dc523f..d92d77c794 100644 --- a/crates/compiler/parse/src/string_literal.rs +++ b/crates/compiler/parse/src/string_literal.rs @@ -1,6 +1,6 @@ use crate::ast::{EscapedChar, StrLiteral, StrSegment}; use crate::expr; -use crate::parser::Progress::*; +use crate::parser::Progress::{self, *}; use crate::parser::{allocated, loc, specialize_ref, word1, BadInputError, EString, Parser}; use crate::state::State; use bumpalo::collections::vec::Vec; @@ -9,7 +9,7 @@ use bumpalo::Bump; /// One or more ASCII hex digits. 
(Useful when parsing unicode escape codes, /// which must consist entirely of ASCII hex digits.) fn ascii_hex_digits<'a>() -> impl Parser<'a, &'a str, EString<'a>> { - move |arena, state: State<'a>| { + move |arena, mut state: State<'a>| { let mut buf = bumpalo::collections::String::new_in(arena); for &byte in state.bytes().iter() { @@ -19,7 +19,7 @@ fn ascii_hex_digits<'a>() -> impl Parser<'a, &'a str, EString<'a>> { // We didn't find any hex digits! return Err((NoProgress, EString::CodePtEnd(state.pos()), state)); } else { - let state = state.advance(buf.len()); + state.advance_mut(buf.len()); return Ok((MadeProgress, buf.into_bump_str(), state)); } @@ -29,36 +29,27 @@ fn ascii_hex_digits<'a>() -> impl Parser<'a, &'a str, EString<'a>> { } } -macro_rules! advance_state { - ($state:expr, $n:expr) => { - Ok($state.advance($n)) - }; -} - pub fn parse_single_quote<'a>() -> impl Parser<'a, &'a str, EString<'a>> { move |arena: &'a Bump, mut state: State<'a>| { - if state.bytes().starts_with(b"\'") { + if state.consume_mut("\'") { // we will be parsing a single-quote-string } else { return Err((NoProgress, EString::Open(state.pos()), state)); } - // early return did not hit, just advance one byte - state = advance_state!(state, 1)?; - // Handle back slaches in byte literal // - starts with a backslash and used as an escape character. 
ex: '\n', '\t' // - single quote floating (un closed single quote) should be an error match state.bytes().first() { Some(b'\\') => { - state = advance_state!(state, 1)?; + state.advance_mut(1); match state.bytes().first() { Some(&ch) => { - state = advance_state!(state, 1)?; + state.advance_mut(1); if (ch == b'n' || ch == b'r' || ch == b't' || ch == b'\'' || ch == b'\\') && (state.bytes().first() == Some(&b'\'')) { - state = advance_state!(state, 1)?; + state.advance_mut(1); let test = match ch { b'n' => '\n', b't' => '\t', @@ -118,7 +109,7 @@ pub fn parse_single_quote<'a>() -> impl Parser<'a, &'a str, EString<'a>> { // ending up w/ a slice of bytes that we want to convert into an integer let raw_bytes = &state.bytes()[0..end_index - 1]; - state = advance_state!(state, end_index)?; + state.advance_mut(end_index); match std::str::from_utf8(raw_bytes) { Ok(string) => Ok((MadeProgress, string, state)), Err(_) => { @@ -129,33 +120,78 @@ pub fn parse_single_quote<'a>() -> impl Parser<'a, &'a str, EString<'a>> { } } +fn consume_indent<'a>( + mut state: State<'a>, + mut indent: u32, +) -> Result<State<'a>, (Progress, EString<'a>, State<'a>)> { + while indent > 0 { + match state.bytes().first() { + Some(b' ') => { + state.advance_mut(1); + indent -= 1; + } + None | Some(b'\n') => { + break; + } + Some(_) => { + return Err(( + MadeProgress, + EString::MultilineInsufficientIndent(state.pos()), + state, + )); + } + } + } + + Ok(state) +} + +fn utf8<'a>( + state: State<'a>, + string_bytes: &'a [u8], +) -> Result<&'a str, (Progress, EString<'a>, State<'a>)> { + std::str::from_utf8(string_bytes).map_err(|_| { + // Note Based on where this `utf8` function is used, the fact that we know the whole string + // in the parser is valid utf8, and barring bugs in the parser itself + // (e.g. where we accidentally split a multibyte utf8 char), this error _should_ actually be unreachable. 
+ ( + MadeProgress, + EString::Space(BadInputError::BadUtf8, state.pos()), + state, + ) + }) +} + pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> { use StrLiteral::*; move |arena: &'a Bump, mut state: State<'a>| { let is_multiline; - let mut bytes; - if state.bytes().starts_with(b"\"\"\"") { - // we will be parsing a multi-string + let indent = state.column(); + + let start_state; + + if state.consume_mut("\"\"\"") { + start_state = state.clone(); + + // we will be parsing a multi-line string is_multiline = true; - bytes = state.bytes()[3..].iter(); - state = advance_state!(state, 3)?; - } else if state.bytes().starts_with(b"\"") { - // we will be parsing a single-string + + if state.consume_mut("\n") { + state = consume_indent(state, indent)?; + } + } else if state.consume_mut("\"") { + start_state = state.clone(); + + // we will be parsing a single-line string is_multiline = false; - bytes = state.bytes()[1..].iter(); - state = advance_state!(state, 1)?; } else { return Err((NoProgress, EString::Open(state.pos()), state)); } - // At the parsing stage we keep the entire raw string, because the formatter - // needs the raw string. (For example, so it can "remember" whether you - // wrote \u{...} or the actual unicode character itself.) - // - // Since we're keeping the entire raw string, all we need to track is - // how many characters we've parsed. So far, that's 1 (the opening `"`). 
+ let mut bytes = state.bytes().iter(); + let mut segment_parsed_bytes = 0; let mut segments = Vec::new_in(arena); @@ -165,7 +201,7 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> { segments.push(StrSegment::EscapedChar($ch)); // Advance past the segment we just added - state = advance_state!(state, segment_parsed_bytes)?; + state.advance_mut(segment_parsed_bytes); // Reset the segment segment_parsed_bytes = 0; @@ -184,7 +220,7 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> { match std::str::from_utf8(string_bytes) { Ok(string) => { - state = advance_state!(state, string.len())?; + state.advance_mut(string.len()); segments.push($transform(string)); } @@ -220,7 +256,7 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> { // special case of the empty string if is_multiline { if bytes.as_slice().starts_with(b"\"\"") { - return Ok((MadeProgress, Block(&[]), advance_state!(state, 3)?)); + return Ok((MadeProgress, Block(&[]), state.advance(3))); } else { // this quote is in a block string continue; @@ -228,7 +264,7 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> { } else { // This is the end of the string! 
// Advance 1 for the close quote - return Ok((MadeProgress, PlainLine(""), advance_state!(state, 1)?)); + return Ok((MadeProgress, PlainLine(""), state.advance(1))); } } else { // the string is non-empty, which means we need to convert any previous segments @@ -250,7 +286,7 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> { Block(arena.alloc([segments.into_bump_slice()])) }; - return Ok((MadeProgress, expr, advance_state!(state, 3)?)); + return Ok((MadeProgress, expr, state.advance(3))); } else { // this quote is in a block string continue; @@ -270,12 +306,30 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> { }; // Advance the state 1 to account for the closing `"` - return Ok((MadeProgress, expr, advance_state!(state, 1)?)); + return Ok((MadeProgress, expr, state.advance(1))); } }; } b'\n' => { if is_multiline { + let without_newline = &state.bytes()[0..(segment_parsed_bytes - 1)]; + let with_newline = &state.bytes()[0..segment_parsed_bytes]; + + state.advance_mut(segment_parsed_bytes); + state = consume_indent(state, indent)?; + bytes = state.bytes().iter(); + + if state.bytes().starts_with(b"\"\"\"") { + // ending the string; don't use the last newline + segments + .push(StrSegment::Plaintext(utf8(state.clone(), without_newline)?)); + } else { + segments + .push(StrSegment::Plaintext(utf8(state.clone(), with_newline)?)); + } + + segment_parsed_bytes = 0; + continue; } else { // This is a single-line string, which cannot have newlines! @@ -283,7 +337,11 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> { // all remaining chars. This will mask all other errors, but // it should make it easiest to debug; the file will be a giant // error starting from where the open quote appeared. 
- return Err((MadeProgress, EString::EndlessSingle(state.pos()), state)); + return Err(( + MadeProgress, + EString::EndlessSingle(start_state.pos()), + start_state, + )); } } b'\\' => { @@ -301,7 +359,7 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> { match bytes.next() { Some(b'(') => { // Advance past the `\(` before using the expr parser - state = advance_state!(state, 2)?; + state.advance_mut(2); let original_byte_count = state.bytes().len(); @@ -328,7 +386,7 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> { } Some(b'u') => { // Advance past the `\u` before using the expr parser - state = advance_state!(state, 2)?; + state.advance_mut(2); let original_byte_count = state.bytes().len(); @@ -386,11 +444,11 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> { Err(( MadeProgress, if is_multiline { - EString::EndlessMulti(state.pos()) + EString::EndlessMulti(start_state.pos()) } else { - EString::EndlessSingle(state.pos()) + EString::EndlessSingle(start_state.pos()) }, - state, + start_state, )) } } diff --git a/crates/compiler/parse/tests/snapshots/pass/multiline_string.expr.result-ast b/crates/compiler/parse/tests/snapshots/pass/multiline_string.expr.result-ast new file mode 100644 index 0000000000..8d34036ead --- /dev/null +++ b/crates/compiler/parse/tests/snapshots/pass/multiline_string.expr.result-ast @@ -0,0 +1,114 @@ +Defs( + Defs { + tags: [ + Index(2147483648), + Index(2147483649), + Index(2147483650), + ], + regions: [ + @0-22, + @23-49, + @50-92, + ], + space_before: [ + Slice(start = 0, length = 0), + Slice(start = 0, length = 1), + Slice(start = 1, length = 1), + ], + space_after: [ + Slice(start = 0, length = 0), + Slice(start = 1, length = 0), + Slice(start = 2, length = 0), + ], + spaces: [ + Newline, + Newline, + ], + type_defs: [], + value_defs: [ + Body( + @0-1 Identifier( + "a", + ), + @4-22 Str( + Line( + [ + Plaintext( + "Hello,", + ), + EscapedChar( + Newline, + ), + EscapedChar( + 
Newline, + ), + Plaintext( + "World!", + ), + ], + ), + ), + ), + Body( + @23-24 Identifier( + "b", + ), + @27-49 Str( + Block( + [ + [ + Plaintext( + "Hello,", + ), + EscapedChar( + Newline, + ), + EscapedChar( + Newline, + ), + Plaintext( + "World!", + ), + ], + ], + ), + ), + ), + Body( + @50-51 Identifier( + "c", + ), + @58-92 SpaceBefore( + Str( + Block( + [ + [ + Plaintext( + "Hello,\n", + ), + Plaintext( + "\n", + ), + Plaintext( + "World!", + ), + ], + ], + ), + ), + [ + Newline, + ], + ), + ), + ], + }, + @93-95 SpaceBefore( + Num( + "42", + ), + [ + Newline, + ], + ), +) diff --git a/crates/compiler/parse/tests/snapshots/pass/multiline_string.expr.roc b/crates/compiler/parse/tests/snapshots/pass/multiline_string.expr.roc new file mode 100644 index 0000000000..a647df6e57 --- /dev/null +++ b/crates/compiler/parse/tests/snapshots/pass/multiline_string.expr.roc @@ -0,0 +1,9 @@ +a = "Hello,\n\nWorld!" +b = """Hello,\n\nWorld!""" +c = + """ + Hello, + + World! + """ +42 diff --git a/crates/compiler/parse/tests/test_parse.rs b/crates/compiler/parse/tests/test_parse.rs index b01a2d19c1..633ac32ea8 100644 --- a/crates/compiler/parse/tests/test_parse.rs +++ b/crates/compiler/parse/tests/test_parse.rs @@ -204,6 +204,7 @@ mod test_parse { pass/not_docs.expr, pass/number_literal_suffixes.expr, pass/one_backpassing.expr, + pass/multiline_string.expr, pass/one_char_string.expr, pass/one_def.expr, pass/one_minus_two.expr, diff --git a/crates/reporting/src/error/parse.rs b/crates/reporting/src/error/parse.rs index c989bc5b09..88f8430899 100644 --- a/crates/reporting/src/error/parse.rs +++ b/crates/reporting/src/error/parse.rs @@ -921,6 +921,27 @@ fn to_str_report<'a>( severity: Severity::RuntimeError, } } + EString::MultilineInsufficientIndent(pos) => { + let surroundings = Region::new(start, pos); + let region = LineColumnRegion::from_pos(lines.convert_pos(pos)); + + let doc = alloc.stack([ + alloc.reflow(r"This multiline string is not sufficiently indented:"), + 
alloc.region_with_subregion(lines.convert_region(surroundings), region), + alloc.concat([ + alloc.reflow(r"Lines in a multi-line string must be indented at least as "), + alloc.reflow("much as the beginning \"\"\". This extra indentation is automatically removed "), + alloc.reflow("from the string during compilation."), + ]), + ]); + + Report { + filename, + doc, + title: "INSUFFICIENT INDENT IN MULTI-LINE STRING".to_string(), + severity: Severity::RuntimeError, + } + } } } fn to_expr_in_parens_report<'a>( diff --git a/crates/reporting/tests/test_reporting.rs b/crates/reporting/tests/test_reporting.rs index 412857021a..9c9d460116 100644 --- a/crates/reporting/tests/test_reporting.rs +++ b/crates/reporting/tests/test_reporting.rs @@ -5228,6 +5228,23 @@ mod test_reporting { "### ); + test_report!( + multi_insufficient_indent, + " \"\"\"\n testing\n \"\"\"", // 4 space indent on the start, 2 space on the `testing` line + @r###" + ── INSUFFICIENT INDENT IN MULTI-LINE STRING ─ ..._insufficient_indent/Test.roc ─ + + This multiline string is not sufficiently indented: + + 5│ testing + ^ + + Lines in a multi-line string must be indented at least as much as the + beginning """. This extra indentation is automatically removed from + the string during compilation. + "### + ); + // https://github.com/rtfeldman/roc/issues/1714 test_report!( interpolate_concat_is_transparent_1714,