mirror of
https://github.com/roc-lang/roc.git
synced 2025-09-29 06:44:46 +00:00
Fix string literal parsing
This commit is contained in:
parent
f35e43768a
commit
e661ca7f2e
2 changed files with 224 additions and 143 deletions
|
@ -2,11 +2,12 @@ use crate::ast::{Attempting, StrLiteral, StrSegment};
|
||||||
use crate::parser::{parse_utf8, unexpected, unexpected_eof, ParseResult, Parser, State};
|
use crate::parser::{parse_utf8, unexpected, unexpected_eof, ParseResult, Parser, State};
|
||||||
use bumpalo::collections::vec::Vec;
|
use bumpalo::collections::vec::Vec;
|
||||||
use bumpalo::Bump;
|
use bumpalo::Bump;
|
||||||
|
use roc_region::all::{Located, Region};
|
||||||
|
|
||||||
pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
|
pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
|
||||||
use StrLiteral::*;
|
use StrLiteral::*;
|
||||||
|
|
||||||
move |arena: &'a Bump, state: State<'a>| {
|
move |arena: &'a Bump, mut state: State<'a>| {
|
||||||
let mut bytes = state.bytes.iter();
|
let mut bytes = state.bytes.iter();
|
||||||
// String literals must start with a quote.
|
// String literals must start with a quote.
|
||||||
// If this doesn't, it must not be a string literal!
|
// If this doesn't, it must not be a string literal!
|
||||||
|
@ -21,14 +22,8 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// The current segment begins right after the opening quotation mark.
|
// Advance past the opening quotation mark.
|
||||||
let mut cur_segment = &state.bytes[1..];
|
state = state.advance_without_indenting(1)?;
|
||||||
|
|
||||||
enum EscapeState {
|
|
||||||
None,
|
|
||||||
Unicode,
|
|
||||||
Interpolation,
|
|
||||||
}
|
|
||||||
|
|
||||||
// At the parsing stage we keep the entire raw string, because the formatter
|
// At the parsing stage we keep the entire raw string, because the formatter
|
||||||
// needs the raw string. (For example, so it can "remember" whether you
|
// needs the raw string. (For example, so it can "remember" whether you
|
||||||
|
@ -36,75 +31,105 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
|
||||||
//
|
//
|
||||||
// Since we're keeping the entire raw string, all we need to track is
|
// Since we're keeping the entire raw string, all we need to track is
|
||||||
// how many characters we've parsed. So far, that's 1 (the opening `"`).
|
// how many bytes of the current segment we've parsed. So far, that's 0 (we already advanced past the opening `"`).
|
||||||
let mut total_parsed_chars = 1;
|
let mut segment_parsed_bytes = 0;
|
||||||
let mut segment_parsed_chars = 0;
|
|
||||||
let mut segments = Vec::new_in(arena);
|
let mut segments = Vec::new_in(arena);
|
||||||
let mut escape_state = EscapeState::None;
|
|
||||||
|
|
||||||
// pub enum StrSegment<'a> {
|
macro_rules! escaped_char {
|
||||||
// Plaintext(&'a str), // e.g. "foo"
|
($ch:expr) => {
|
||||||
// Unicode(&'a str), // e.g. "00A0" in "\u(00A0)"
|
// Record the escaped char.
|
||||||
// Interpolated(&'a str), // e.g. "name" in "Hi, \(name)!"
|
segments.push(StrSegment::EscapedChar($ch));
|
||||||
// EscapedChar(char), // e.g. '\n' in "Hello!\n"
|
|
||||||
// }
|
// Advance past the segment we just added
|
||||||
|
state = state.advance_without_indenting(segment_parsed_bytes)?;
|
||||||
|
|
||||||
|
// Reset the segment
|
||||||
|
segment_parsed_bytes = 0;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
macro_rules! end_segment {
|
||||||
|
($transform:expr) => {
|
||||||
|
dbg!("ending segment");
|
||||||
|
dbg!(segment_parsed_bytes - 1);
|
||||||
|
dbg!(&state.bytes);
|
||||||
|
|
||||||
|
// Don't push anything if the string would be empty.
|
||||||
|
if segment_parsed_bytes > 1 {
|
||||||
|
// This macro is always invoked after we just parsed
|
||||||
|
// something which signalled that we should end the
|
||||||
|
// current segment - so use segment_parsed_bytes - 1 here,
|
||||||
|
// to exclude that char we just parsed.
|
||||||
|
let string_bytes = &state.bytes[0..(segment_parsed_bytes - 1)];
|
||||||
|
|
||||||
|
match parse_utf8(string_bytes) {
|
||||||
|
Ok(string) => {
|
||||||
|
state = state.advance_without_indenting(string.len())?;
|
||||||
|
|
||||||
|
segments.push($transform(string));
|
||||||
|
|
||||||
|
dbg!(&segments);
|
||||||
|
}
|
||||||
|
Err(reason) => {
|
||||||
|
return state.fail(reason);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// If we parsed 0 plaintext bytes, there is no segment to push.
|
||||||
|
}
|
||||||
|
|
||||||
|
// Depending on where this macro is used, in some
|
||||||
|
// places this is unused.
|
||||||
|
#[allow(unused_assignments)]
|
||||||
|
{
|
||||||
|
// This macro is always invoked after we just parsed
|
||||||
|
// something which signalled that we should end the
|
||||||
|
// current segment.
|
||||||
|
segment_parsed_bytes = 1;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
while let Some(&byte) = bytes.next() {
|
while let Some(&byte) = bytes.next() {
|
||||||
segment_parsed_chars += 1;
|
dbg!("Parsing {:?}", (byte as char).to_string());
|
||||||
|
// This is for the byte we just grabbed from the iterator.
|
||||||
|
segment_parsed_bytes += 1;
|
||||||
|
|
||||||
// Potentially end the string (unless this is an escaped `"`!)
|
|
||||||
match byte {
|
match byte {
|
||||||
b'"' => {
|
b'"' => {
|
||||||
// If we aren't escaping, then this is the end of the string!
|
// This is the end of the string!
|
||||||
if let EscapeState::None = escape_state {
|
if segment_parsed_bytes == 1 && segments.is_empty() {
|
||||||
let (literal, state) = if total_parsed_chars == 1 && segments.is_empty() {
|
match bytes.next() {
|
||||||
match bytes.next() {
|
Some(b'"') => {
|
||||||
Some(b'"') => {
|
// If the very first three chars were all `"`,
|
||||||
// If the very first three chars were all `"`,
|
// then this literal begins with `"""`
|
||||||
// then this literal begins with `"""`
|
// and is a block string.
|
||||||
// and is a block string.
|
return parse_block_string(arena, state, &mut bytes);
|
||||||
return parse_block_string(arena, state, &mut bytes);
|
}
|
||||||
|
_ => {
|
||||||
|
return Ok((PlainLine(""), state.advance_without_indenting(2)?));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
end_segment!(StrSegment::Plaintext);
|
||||||
|
|
||||||
|
let expr = if segments.len() == 1 {
|
||||||
|
// We had exactly one segment, so this is a candidate
|
||||||
|
// to be StrLiteral::PlainLine
|
||||||
|
match segments.pop().unwrap() {
|
||||||
|
StrSegment::Plaintext(string) => StrLiteral::PlainLine(string),
|
||||||
|
other => {
|
||||||
|
let vec = bumpalo::vec![in arena; other];
|
||||||
|
|
||||||
|
StrLiteral::LineWithEscapes(vec.into_bump_slice())
|
||||||
}
|
}
|
||||||
_ => (PlainLine(""), state.advance_without_indenting(2)?),
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Subtract 1 from parsed_chars so we omit the closing `"`.
|
LineWithEscapes(segments.into_bump_slice())
|
||||||
let string_bytes = &cur_segment[0..(segment_parsed_chars - 1)];
|
|
||||||
|
|
||||||
match parse_utf8(string_bytes) {
|
|
||||||
Ok(string) => {
|
|
||||||
total_parsed_chars += segment_parsed_chars;
|
|
||||||
|
|
||||||
let state =
|
|
||||||
state.advance_without_indenting(total_parsed_chars)?;
|
|
||||||
|
|
||||||
if segments.is_empty() {
|
|
||||||
// We only had one segment.
|
|
||||||
(StrLiteral::PlainLine(string), state)
|
|
||||||
} else {
|
|
||||||
// We had multiple segments! Parse the
|
|
||||||
// current one and add it to the list.
|
|
||||||
segments.push(StrSegment::Plaintext(string));
|
|
||||||
|
|
||||||
(LineWithEscapes(segments.into_bump_slice()), state)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(reason) => {
|
|
||||||
return state.fail(reason);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
return Ok((literal, state));
|
// Advance the state 1 to account for the closing `"`
|
||||||
} else {
|
return Ok((expr, state.advance_without_indenting(1)?));
|
||||||
// We are escaping, so this is an error. (If it were an
|
};
|
||||||
// escaped single character like \" then we would have
|
|
||||||
// handled that scenario already.)
|
|
||||||
return Err(unexpected(
|
|
||||||
state.bytes.len() - 1,
|
|
||||||
state,
|
|
||||||
Attempting::StrLiteral,
|
|
||||||
));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
b'\n' => {
|
b'\n' => {
|
||||||
// This is a single-line string, which cannot have newlines!
|
// This is a single-line string, which cannot have newlines!
|
||||||
|
@ -118,75 +143,82 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
|
||||||
Attempting::StrLiteral,
|
Attempting::StrLiteral,
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
b')' => {
|
|
||||||
// All escape sequences end in a close paren, so we don't
|
|
||||||
// need to pay for a conditional here. If it was an escape,
|
|
||||||
// then we want to set it to None, and if it wasn't an
|
|
||||||
// escape, then setting it from None to None is harmless!
|
|
||||||
// (And likely cheaper than a conditional.)
|
|
||||||
escape_state = EscapeState::None;
|
|
||||||
}
|
|
||||||
b'\\' => {
|
b'\\' => {
|
||||||
// This is the start of a new escape
|
// We're about to begin an escaped segment of some sort!
|
||||||
if let EscapeState::None = escape_state {
|
//
|
||||||
match bytes.next() {
|
// Record the current segment so we can begin a new one.
|
||||||
Some(b'(') => {
|
// End it right before the `\` char we just parsed.
|
||||||
// This is an interpolated variable
|
end_segment!(StrSegment::Plaintext);
|
||||||
escape_state = EscapeState::Interpolation;
|
|
||||||
todo!("Parse interpolated ident");
|
// This is for the byte we're about to parse.
|
||||||
|
segment_parsed_bytes += 1;
|
||||||
|
|
||||||
|
// This is the start of a new escape. Look at the next byte
|
||||||
|
// to figure out what type of escape it is.
|
||||||
|
match bytes.next() {
|
||||||
|
Some(b'(') => {
|
||||||
|
// This is an interpolated variable
|
||||||
|
todo!("Make a new parser state, then use it to parse ident followed by ')'");
|
||||||
|
}
|
||||||
|
Some(b'u') => {
|
||||||
|
// This is an escaped unicode character
|
||||||
|
if let Some(b'(') = bytes.next() {
|
||||||
|
segment_parsed_bytes += 1;
|
||||||
|
} else {
|
||||||
|
// Whenever we encounter `\u` it must be followed
|
||||||
|
// by a `(` char!
|
||||||
|
return Err(unexpected(0, state, Attempting::StrLiteral));
|
||||||
}
|
}
|
||||||
Some(b'u') => {
|
|
||||||
escape_state = EscapeState::Unicode;
|
while let Some(&byte) = bytes.next() {
|
||||||
// This is an escaped unicode character
|
segment_parsed_bytes += 1;
|
||||||
todo!("Parse '(' and then parse escaped unicode character");
|
|
||||||
}
|
if (byte as char).is_ascii_hexdigit() {
|
||||||
Some(ch @ b'\n') | Some(ch @ b'\t') | Some(ch @ b'\r')
|
// This is the most common case.
|
||||||
| Some(ch @ b'"') | Some(ch @ b'\\') => {
|
} else if byte == b')' {
|
||||||
// Record the current segment so we can begin a new one.
|
// Add the segment
|
||||||
match parse_utf8(cur_segment) {
|
end_segment!(|string: &'a str| {
|
||||||
Ok(string) => {
|
let value = &string[0..string.len() - 1];
|
||||||
segments.push(StrSegment::Plaintext(string));
|
|
||||||
}
|
StrSegment::Unicode(Located {
|
||||||
Err(reason) => {
|
region: Region::zero(), // TODO calculate the right region
|
||||||
return state.fail(reason);
|
value,
|
||||||
}
|
})
|
||||||
|
});
|
||||||
|
|
||||||
|
// We're done parsing digits now.
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
// Unicode escapes must consist entirely of hex digits!
|
||||||
|
return Err(unexpected(0, state, Attempting::StrLiteral));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Record the escaped char.
|
|
||||||
segments.push(StrSegment::EscapedChar(*ch as char));
|
|
||||||
|
|
||||||
// We're now done escaping.
|
|
||||||
escape_state = EscapeState::None;
|
|
||||||
|
|
||||||
// Advance past the segment we just added, and
|
|
||||||
// also past the escaped char we just added.
|
|
||||||
//
|
|
||||||
// +2 because we just parsed a backslash and
|
|
||||||
// one other char after it.
|
|
||||||
cur_segment = &cur_segment[(segment_parsed_chars + 2)..];
|
|
||||||
|
|
||||||
// Reset segment_parsed_chars to 0 because we're now
|
|
||||||
// parsing the beginning of a new segment.
|
|
||||||
segment_parsed_chars = 0;
|
|
||||||
}
|
|
||||||
_ => {
|
|
||||||
// Invalid escape! A backslash must be followed
|
|
||||||
// by either an open paren or else one of the
|
|
||||||
// escapable characters (\n, \t, \", \\, etc)
|
|
||||||
return Err(unexpected(
|
|
||||||
state.bytes.len() - 1,
|
|
||||||
state,
|
|
||||||
Attempting::StrLiteral,
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
Some(b'\\') => {
|
||||||
// Can't have a \ inside an escape!
|
escaped_char!('\\');
|
||||||
return Err(unexpected(
|
}
|
||||||
state.bytes.len() - 1,
|
Some(b'"') => {
|
||||||
state,
|
escaped_char!('"');
|
||||||
Attempting::StrLiteral,
|
}
|
||||||
));
|
Some(b'r') => {
|
||||||
|
escaped_char!('\r');
|
||||||
|
}
|
||||||
|
Some(b't') => {
|
||||||
|
escaped_char!('\t');
|
||||||
|
}
|
||||||
|
Some(b'n') => {
|
||||||
|
escaped_char!('\n');
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
// Invalid escape! A backslash must be followed
|
||||||
|
// by either an open paren or else one of the
|
||||||
|
// escapable characters (\n, \t, \", \\, etc)
|
||||||
|
return Err(unexpected(
|
||||||
|
state.bytes.len() - 1,
|
||||||
|
state,
|
||||||
|
Attempting::StrLiteral,
|
||||||
|
));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
_ => {
|
_ => {
|
||||||
|
@ -197,7 +229,7 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
|
||||||
|
|
||||||
// We ran out of characters before finding a closed quote
|
// We ran out of characters before finding a closed quote
|
||||||
Err(unexpected_eof(
|
Err(unexpected_eof(
|
||||||
total_parsed_chars,
|
state.bytes.len(),
|
||||||
Attempting::StrLiteral,
|
Attempting::StrLiteral,
|
||||||
state.clone(),
|
state.clone(),
|
||||||
))
|
))
|
||||||
|
@ -236,7 +268,7 @@ where
|
||||||
|
|
||||||
return match parse_utf8(line_bytes) {
|
return match parse_utf8(line_bytes) {
|
||||||
Ok(_line) => {
|
Ok(_line) => {
|
||||||
// let state = state.advance_without_indenting(parsed_chars)?;
|
// state = state.advance_without_indenting(parsed_chars)?;
|
||||||
|
|
||||||
// lines.push(line);
|
// lines.push(line);
|
||||||
|
|
||||||
|
|
|
@ -25,8 +25,9 @@ mod test_parse {
|
||||||
use roc_parse::ast::Expr::{self, *};
|
use roc_parse::ast::Expr::{self, *};
|
||||||
use roc_parse::ast::Pattern::{self, *};
|
use roc_parse::ast::Pattern::{self, *};
|
||||||
use roc_parse::ast::StrLiteral::*;
|
use roc_parse::ast::StrLiteral::*;
|
||||||
|
use roc_parse::ast::StrSegment::*;
|
||||||
use roc_parse::ast::{
|
use roc_parse::ast::{
|
||||||
Attempting, Def, InterfaceHeader, Spaceable, Tag, TypeAnnotation, WhenBranch,
|
self, Attempting, Def, InterfaceHeader, Spaceable, Tag, TypeAnnotation, WhenBranch,
|
||||||
};
|
};
|
||||||
use roc_parse::header::ModuleName;
|
use roc_parse::header::ModuleName;
|
||||||
use roc_parse::module::{interface_header, module_defs};
|
use roc_parse::module::{interface_header, module_defs};
|
||||||
|
@ -49,6 +50,31 @@ mod test_parse {
|
||||||
assert_eq!(Err(expected_fail), actual);
|
assert_eq!(Err(expected_fail), actual);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn parses_with_escaped_char<
|
||||||
|
I: Fn(&'static str) -> String,
|
||||||
|
E: Fn(char, &Bump) -> Vec<'_, ast::StrSegment<'static>>,
|
||||||
|
>(
|
||||||
|
to_input: I,
|
||||||
|
to_expected: E,
|
||||||
|
) {
|
||||||
|
let arena = Bump::new();
|
||||||
|
|
||||||
|
// Try parsing with each of the escaped chars Roc supports
|
||||||
|
for (string, ch) in &[
|
||||||
|
("\\\\", '\\'),
|
||||||
|
("\\n", '\n'),
|
||||||
|
("\\r", '\r'),
|
||||||
|
("\\t", '\t'),
|
||||||
|
("\\\"", '"'),
|
||||||
|
] {
|
||||||
|
let actual = parse_with(&arena, arena.alloc(to_input(string)));
|
||||||
|
let expected_slice = to_expected(*ch, &arena).into_bump_slice();
|
||||||
|
let expected_expr = Expr::Str(LineWithEscapes(expected_slice));
|
||||||
|
|
||||||
|
assert_eq!(Ok(expected_expr), actual);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// STRING LITERALS
|
// STRING LITERALS
|
||||||
|
|
||||||
fn expect_parsed_str(input: &str, expected: &str) {
|
fn expect_parsed_str(input: &str, expected: &str) {
|
||||||
|
@ -103,12 +129,35 @@ mod test_parse {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn string_with_special_escapes() {
|
fn string_with_escaped_char_at_end() {
|
||||||
expect_parsed_str(r#"x\\x"#, r#""x\\x""#);
|
parses_with_escaped_char(
|
||||||
expect_parsed_str(r#"x\"x"#, r#""x\"x""#);
|
|esc| format!(r#""abcd{}""#, esc),
|
||||||
expect_parsed_str(r#"x\tx"#, r#""x\tx""#);
|
|esc, arena| bumpalo::vec![in arena; Plaintext("abcd"), EscapedChar(esc)],
|
||||||
expect_parsed_str(r#"x\rx"#, r#""x\rx""#);
|
);
|
||||||
expect_parsed_str(r#"x\nx"#, r#""x\nx""#);
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn string_with_escaped_char_in_front() {
|
||||||
|
parses_with_escaped_char(
|
||||||
|
|esc| format!(r#""{}abcd""#, esc),
|
||||||
|
|esc, arena| bumpalo::vec![in arena; EscapedChar(esc), Plaintext("abcd")],
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn string_with_escaped_char_in_middle() {
|
||||||
|
parses_with_escaped_char(
|
||||||
|
|esc| format!(r#""ab{}cd""#, esc),
|
||||||
|
|esc, arena| bumpalo::vec![in arena; Plaintext("ab"), EscapedChar(esc), Plaintext("cd")],
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn string_with_multiple_escaped_chars() {
|
||||||
|
parses_with_escaped_char(
|
||||||
|
|esc| format!(r#""{}abc{}de{}fghi{}""#, esc, esc, esc, esc),
|
||||||
|
|esc, arena| bumpalo::vec![in arena; EscapedChar(esc), Plaintext("abc"), EscapedChar(esc), Plaintext("de"), EscapedChar(esc), Plaintext("fghi"), EscapedChar(esc)],
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -1861,12 +1910,12 @@ mod test_parse {
|
||||||
let arena = Bump::new();
|
let arena = Bump::new();
|
||||||
let newlines = bumpalo::vec![in &arena; Newline];
|
let newlines = bumpalo::vec![in &arena; Newline];
|
||||||
let pattern1 = Pattern::SpaceBefore(
|
let pattern1 = Pattern::SpaceBefore(
|
||||||
arena.alloc(StrLiteral(PlainLine("blah"))),
|
arena.alloc(StrLiteral(PlainLine(""))),
|
||||||
newlines.into_bump_slice(),
|
newlines.into_bump_slice(),
|
||||||
);
|
);
|
||||||
let loc_pattern1 = Located::new(1, 1, 1, 7, pattern1);
|
let loc_pattern1 = Located::new(1, 1, 1, 4, pattern1);
|
||||||
let expr1 = Num("1");
|
let expr1 = Num("1");
|
||||||
let loc_expr1 = Located::new(1, 1, 11, 12, expr1);
|
let loc_expr1 = Located::new(1, 1, 7, 8, expr1);
|
||||||
let branch1 = &*arena.alloc(WhenBranch {
|
let branch1 = &*arena.alloc(WhenBranch {
|
||||||
patterns: bumpalo::vec![in &arena;loc_pattern1],
|
patterns: bumpalo::vec![in &arena;loc_pattern1],
|
||||||
value: loc_expr1,
|
value: loc_expr1,
|
||||||
|
@ -1897,7 +1946,7 @@ mod test_parse {
|
||||||
indoc!(
|
indoc!(
|
||||||
r#"
|
r#"
|
||||||
when x is
|
when x is
|
||||||
"blah" -> 1
|
"" -> 1
|
||||||
"mise" -> 2
|
"mise" -> 2
|
||||||
"#
|
"#
|
||||||
),
|
),
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue