Fix string literal parsing

This commit is contained in:
Richard Feldman 2020-08-29 11:20:10 -04:00
parent f35e43768a
commit e661ca7f2e
2 changed files with 224 additions and 143 deletions

View file

@ -2,11 +2,12 @@ use crate::ast::{Attempting, StrLiteral, StrSegment};
use crate::parser::{parse_utf8, unexpected, unexpected_eof, ParseResult, Parser, State}; use crate::parser::{parse_utf8, unexpected, unexpected_eof, ParseResult, Parser, State};
use bumpalo::collections::vec::Vec; use bumpalo::collections::vec::Vec;
use bumpalo::Bump; use bumpalo::Bump;
use roc_region::all::{Located, Region};
pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> { pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
use StrLiteral::*; use StrLiteral::*;
move |arena: &'a Bump, state: State<'a>| { move |arena: &'a Bump, mut state: State<'a>| {
let mut bytes = state.bytes.iter(); let mut bytes = state.bytes.iter();
// String literals must start with a quote. // String literals must start with a quote.
// If this doesn't, it must not be a string literal! // If this doesn't, it must not be a string literal!
@ -21,14 +22,8 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
} }
} }
// The current segment begins right after the opening quotation mark. // Advance past the opening quotation mark.
let mut cur_segment = &state.bytes[1..]; state = state.advance_without_indenting(1)?;
enum EscapeState {
None,
Unicode,
Interpolation,
}
// At the parsing stage we keep the entire raw string, because the formatter // At the parsing stage we keep the entire raw string, because the formatter
// needs the raw string. (For example, so it can "remember" whether you // needs the raw string. (For example, so it can "remember" whether you
@ -36,27 +31,73 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
// //
// Since we're keeping the entire raw string, all we need to track is // Since we're keeping the entire raw string, all we need to track is
// how many characters we've parsed. So far, that's 1 (the opening `"`). // how many characters we've parsed. So far, that's 1 (the opening `"`).
let mut total_parsed_chars = 1; let mut segment_parsed_bytes = 0;
let mut segment_parsed_chars = 0;
let mut segments = Vec::new_in(arena); let mut segments = Vec::new_in(arena);
let mut escape_state = EscapeState::None;
// pub enum StrSegment<'a> { macro_rules! escaped_char {
// Plaintext(&'a str), // e.g. "foo" ($ch:expr) => {
// Unicode(&'a str), // e.g. "00A0" in "\u(00A0)" // Record the escaped char.
// Interpolated(&'a str), // e.g. "name" in "Hi, \(name)!" segments.push(StrSegment::EscapedChar($ch));
// EscapedChar(char), // e.g. '\n' in "Hello!\n"
// } // Advance past the segment we just added
state = state.advance_without_indenting(segment_parsed_bytes)?;
// Reset the segment
segment_parsed_bytes = 0;
};
}
// Ends the segment currently being accumulated and starts a new one.
//
// `$transform` is applied to the segment's text (a `&str` produced by
// `parse_utf8`) and must yield the value that gets pushed onto `segments`,
// e.g. `StrSegment::Plaintext`.
//
// This macro reads and writes the enclosing `state`, `segments` and
// `segment_parsed_bytes` bindings. It can return early from the enclosing
// closure: via `?` if advancing the state fails, or via `state.fail(..)`
// if the segment's bytes are rejected by `parse_utf8`.
macro_rules! end_segment {
    ($transform:expr) => {
        // This macro is always invoked right after we parsed a byte which
        // signalled that the current segment should end, so that byte is
        // already counted in segment_parsed_bytes. A count of 1 or less
        // therefore means the segment itself is empty, and we push nothing.
        if segment_parsed_bytes > 1 {
            // Use segment_parsed_bytes - 1 to exclude the terminating
            // byte we just parsed from the segment's contents.
            let string_bytes = &state.bytes[0..(segment_parsed_bytes - 1)];

            match parse_utf8(string_bytes) {
                Ok(string) => {
                    // Advance past the segment's text only; the
                    // terminating byte is accounted for below.
                    state = state.advance_without_indenting(string.len())?;

                    segments.push($transform(string));
                }
                Err(reason) => {
                    return state.fail(reason);
                }
            }
        }

        // Depending on where this macro is used, in some
        // places this assignment is never read afterwards.
        #[allow(unused_assignments)]
        {
            // The terminating byte we just parsed has not been advanced
            // past yet, so the next segment starts out with it counted.
            segment_parsed_bytes = 1;
        }
    };
}
while let Some(&byte) = bytes.next() { while let Some(&byte) = bytes.next() {
segment_parsed_chars += 1; dbg!("Parsing {:?}", (byte as char).to_string());
// This is for the byte we just grabbed from the iterator.
segment_parsed_bytes += 1;
// Potentially end the string (unless this is an escaped `"`!)
match byte { match byte {
b'"' => { b'"' => {
// If we aren't escaping, then this is the end of the string! // This is the end of the string!
if let EscapeState::None = escape_state { if segment_parsed_bytes == 1 && segments.is_empty() {
let (literal, state) = if total_parsed_chars == 1 && segments.is_empty() {
match bytes.next() { match bytes.next() {
Some(b'"') => { Some(b'"') => {
// If the very first three chars were all `"`, // If the very first three chars were all `"`,
@ -64,47 +105,31 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
// and is a block string. // and is a block string.
return parse_block_string(arena, state, &mut bytes); return parse_block_string(arena, state, &mut bytes);
} }
_ => (PlainLine(""), state.advance_without_indenting(2)?), _ => {
return Ok((PlainLine(""), state.advance_without_indenting(2)?));
}
} }
} else { } else {
// Subtract 1 from parsed_chars so we omit the closing `"`. end_segment!(StrSegment::Plaintext);
let string_bytes = &cur_segment[0..(segment_parsed_chars - 1)];
match parse_utf8(string_bytes) { let expr = if segments.len() == 1 {
Ok(string) => { // We had exactly one segment, so this is a candidate
total_parsed_chars += segment_parsed_chars; // to be StrLiteral::PlainLine
match segments.pop().unwrap() {
StrSegment::Plaintext(string) => StrLiteral::PlainLine(string),
other => {
let vec = bumpalo::vec![in arena; other];
let state = StrLiteral::LineWithEscapes(vec.into_bump_slice())
state.advance_without_indenting(total_parsed_chars)?; }
}
if segments.is_empty() {
// We only had one segment.
(StrLiteral::PlainLine(string), state)
} else { } else {
// We had multiple segments! Parse the LineWithEscapes(segments.into_bump_slice())
// current one and add it to the list.
segments.push(StrSegment::Plaintext(string));
(LineWithEscapes(segments.into_bump_slice()), state)
}
}
Err(reason) => {
return state.fail(reason);
}
}
}; };
return Ok((literal, state)); // Advance the state 1 to account for the closing `"`
} else { return Ok((expr, state.advance_without_indenting(1)?));
// We are escaping, so this is an error. (If it were an };
// escaped single character like \" then we would have
// handled that scenario already.)
return Err(unexpected(
state.bytes.len() - 1,
state,
Attempting::StrLiteral,
));
}
} }
b'\n' => { b'\n' => {
// This is a single-line string, which cannot have newlines! // This is a single-line string, which cannot have newlines!
@ -118,56 +143,71 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
Attempting::StrLiteral, Attempting::StrLiteral,
)); ));
} }
b')' => {
// All escape sequences end in a close paren, so we don't
// need to pay for a conditional here. If it was an escape,
// then we want to set it to None, and if it wasn't an
// escape, then setting it from None to None is harmless!
// (And likely cheaper than a conditional.)
escape_state = EscapeState::None;
}
b'\\' => { b'\\' => {
// This is the start of a new escape // We're about to begin an escaped segment of some sort!
if let EscapeState::None = escape_state { //
// Record the current segment so we can begin a new one.
// End it right before the `\` char we just parsed.
end_segment!(StrSegment::Plaintext);
// This is for the byte we're about to parse.
segment_parsed_bytes += 1;
// This is the start of a new escape. Look at the next byte
// to figure out what type of escape it is.
match bytes.next() { match bytes.next() {
Some(b'(') => { Some(b'(') => {
// This is an interpolated variable // This is an interpolated variable
escape_state = EscapeState::Interpolation; todo!("Make a new parser state, then use it to parse ident followed by ')'");
todo!("Parse interpolated ident");
} }
Some(b'u') => { Some(b'u') => {
escape_state = EscapeState::Unicode;
// This is an escaped unicode character // This is an escaped unicode character
todo!("Parse '(' and then parse escaped unicode character"); if let Some(b'(') = bytes.next() {
} segment_parsed_bytes += 1;
Some(ch @ b'\n') | Some(ch @ b'\t') | Some(ch @ b'\r') } else {
| Some(ch @ b'"') | Some(ch @ b'\\') => { // Whenever we encounter `\u` it must be followed
// Record the current segment so we can begin a new one. // by a `(` char!
match parse_utf8(cur_segment) { return Err(unexpected(0, state, Attempting::StrLiteral));
Ok(string) => {
segments.push(StrSegment::Plaintext(string));
}
Err(reason) => {
return state.fail(reason);
}
} }
// Record the escaped char. while let Some(&byte) = bytes.next() {
segments.push(StrSegment::EscapedChar(*ch as char)); segment_parsed_bytes += 1;
// We're now done escaping. if (byte as char).is_ascii_hexdigit() {
escape_state = EscapeState::None; // This is the most common case.
} else if byte == b')' {
// Add the segment
end_segment!(|string: &'a str| {
let value = &string[0..string.len() - 1];
// Advance past the segment we just added, and StrSegment::Unicode(Located {
// also past the escaped char we just added. region: Region::zero(), // TODO calculate the right region
// value,
// +2 because we just parsed a backslash and })
// one other char after it. });
cur_segment = &cur_segment[(segment_parsed_chars + 2)..];
// Reset segment_parsed_chars to 0 because we're now // We're done parsing digits now.
// parsing the beginning of a new segment. break;
segment_parsed_chars = 0; } else {
// Unicode escapes must all be digits!
return Err(unexpected(0, state, Attempting::StrLiteral));
}
}
}
Some(b'\\') => {
escaped_char!('\\');
}
Some(b'"') => {
escaped_char!('"');
}
Some(b'r') => {
escaped_char!('\r');
}
Some(b't') => {
escaped_char!('\t');
}
Some(b'n') => {
escaped_char!('\n');
} }
_ => { _ => {
// Invalid escape! A backslash must be followed // Invalid escape! A backslash must be followed
@ -180,14 +220,6 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
)); ));
} }
} }
} else {
// Can't have a \ inside an escape!
return Err(unexpected(
state.bytes.len() - 1,
state,
Attempting::StrLiteral,
));
}
} }
_ => { _ => {
// All other characters need no special handling. // All other characters need no special handling.
@ -197,7 +229,7 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
// We ran out of characters before finding a closed quote // We ran out of characters before finding a closed quote
Err(unexpected_eof( Err(unexpected_eof(
total_parsed_chars, state.bytes.len(),
Attempting::StrLiteral, Attempting::StrLiteral,
state.clone(), state.clone(),
)) ))
@ -236,7 +268,7 @@ where
return match parse_utf8(line_bytes) { return match parse_utf8(line_bytes) {
Ok(_line) => { Ok(_line) => {
// let state = state.advance_without_indenting(parsed_chars)?; // state = state.advance_without_indenting(parsed_chars)?;
// lines.push(line); // lines.push(line);

View file

@ -25,8 +25,9 @@ mod test_parse {
use roc_parse::ast::Expr::{self, *}; use roc_parse::ast::Expr::{self, *};
use roc_parse::ast::Pattern::{self, *}; use roc_parse::ast::Pattern::{self, *};
use roc_parse::ast::StrLiteral::*; use roc_parse::ast::StrLiteral::*;
use roc_parse::ast::StrSegment::*;
use roc_parse::ast::{ use roc_parse::ast::{
Attempting, Def, InterfaceHeader, Spaceable, Tag, TypeAnnotation, WhenBranch, self, Attempting, Def, InterfaceHeader, Spaceable, Tag, TypeAnnotation, WhenBranch,
}; };
use roc_parse::header::ModuleName; use roc_parse::header::ModuleName;
use roc_parse::module::{interface_header, module_defs}; use roc_parse::module::{interface_header, module_defs};
@ -49,6 +50,31 @@ mod test_parse {
assert_eq!(Err(expected_fail), actual); assert_eq!(Err(expected_fail), actual);
} }
/// Runs one parse per escaped char Roc supports and asserts that each
/// input parses to `Expr::Str(LineWithEscapes(..))` with the expected segments.
///
/// * `to_input` receives the escape exactly as it is written in Roc source
///   (for example a backslash followed by `n`) and returns the full source
///   text to parse.
/// * `to_expected` receives the char that escape stands for, plus the arena
///   to allocate in, and returns the segments the parser should produce.
fn parses_with_escaped_char<
    I: Fn(&'static str) -> String,
    E: Fn(char, &Bump) -> Vec<'_, ast::StrSegment<'static>>,
>(
    to_input: I,
    to_expected: E,
) {
    // Each pair is (escape as written in source, char it stands for).
    const ESCAPES: [(&str, char); 5] = [
        ("\\\\", '\\'),
        ("\\n", '\n'),
        ("\\r", '\r'),
        ("\\t", '\t'),
        ("\\\"", '"'),
    ];

    let arena = Bump::new();

    for &(escape_src, escaped) in ESCAPES.iter() {
        let actual = parse_with(&arena, arena.alloc(to_input(escape_src)));
        let expected_segments = to_expected(escaped, &arena).into_bump_slice();

        assert_eq!(Ok(Expr::Str(LineWithEscapes(expected_segments))), actual);
    }
}
// STRING LITERALS // STRING LITERALS
fn expect_parsed_str(input: &str, expected: &str) { fn expect_parsed_str(input: &str, expected: &str) {
@ -103,12 +129,35 @@ mod test_parse {
} }
#[test] #[test]
fn string_with_special_escapes() { fn string_with_escaped_char_at_end() {
expect_parsed_str(r#"x\\x"#, r#""x\\x""#); parses_with_escaped_char(
expect_parsed_str(r#"x\"x"#, r#""x\"x""#); |esc| format!(r#""abcd{}""#, esc),
expect_parsed_str(r#"x\tx"#, r#""x\tx""#); |esc, arena| bumpalo::vec![in arena; Plaintext("abcd"), EscapedChar(esc)],
expect_parsed_str(r#"x\rx"#, r#""x\rx""#); );
expect_parsed_str(r#"x\nx"#, r#""x\nx""#); }
// The escape sequence is the first thing inside the quotes, so the expected
// segments begin with the EscapedChar and end with the plaintext "abcd".
// Run once for every escape that parses_with_escaped_char iterates over.
#[test]
fn string_with_escaped_char_in_front() {
parses_with_escaped_char(
|esc| format!(r#""{}abcd""#, esc),
|esc, arena| bumpalo::vec![in arena; EscapedChar(esc), Plaintext("abcd")],
);
}
// The escape sequence sits between "ab" and "cd", so the expected segments
// are plaintext, then the EscapedChar, then plaintext again.
// Run once for every escape that parses_with_escaped_char iterates over.
#[test]
fn string_with_escaped_char_in_middle() {
parses_with_escaped_char(
|esc| format!(r#""ab{}cd""#, esc),
|esc, arena| bumpalo::vec![in arena; Plaintext("ab"), EscapedChar(esc), Plaintext("cd")],
);
}
#[test]
fn string_with_multiple_escaped_chars() {
parses_with_escaped_char(
|esc| format!(r#""{}abc{}de{}fghi{}""#, esc, esc, esc, esc),
|esc, arena| bumpalo::vec![in arena; EscapedChar(esc), Plaintext("abc"), EscapedChar(esc), Plaintext("de"), EscapedChar(esc), Plaintext("fghi"), EscapedChar(esc)],
);
} }
#[test] #[test]
@ -1861,12 +1910,12 @@ mod test_parse {
let arena = Bump::new(); let arena = Bump::new();
let newlines = bumpalo::vec![in &arena; Newline]; let newlines = bumpalo::vec![in &arena; Newline];
let pattern1 = Pattern::SpaceBefore( let pattern1 = Pattern::SpaceBefore(
arena.alloc(StrLiteral(PlainLine("blah"))), arena.alloc(StrLiteral(PlainLine(""))),
newlines.into_bump_slice(), newlines.into_bump_slice(),
); );
let loc_pattern1 = Located::new(1, 1, 1, 7, pattern1); let loc_pattern1 = Located::new(1, 1, 1, 4, pattern1);
let expr1 = Num("1"); let expr1 = Num("1");
let loc_expr1 = Located::new(1, 1, 11, 12, expr1); let loc_expr1 = Located::new(1, 1, 7, 8, expr1);
let branch1 = &*arena.alloc(WhenBranch { let branch1 = &*arena.alloc(WhenBranch {
patterns: bumpalo::vec![in &arena;loc_pattern1], patterns: bumpalo::vec![in &arena;loc_pattern1],
value: loc_expr1, value: loc_expr1,
@ -1897,7 +1946,7 @@ mod test_parse {
indoc!( indoc!(
r#" r#"
when x is when x is
"blah" -> 1 "" -> 1
"mise" -> 2 "mise" -> 2
"# "#
), ),