diff --git a/compiler/parse/src/ast.rs b/compiler/parse/src/ast.rs
index fcd1792b3e..9435c8cdb3 100644
--- a/compiler/parse/src/ast.rs
+++ b/compiler/parse/src/ast.rs
@@ -609,6 +609,7 @@ pub enum Attempting {
     Module,
     Record,
     Identifier,
+    HexDigit,
     ConcreteType,
     TypeVariable,
     WhenCondition,
diff --git a/compiler/parse/src/parser.rs b/compiler/parse/src/parser.rs
index fff068818a..f83b9cbc46 100644
--- a/compiler/parse/src/parser.rs
+++ b/compiler/parse/src/parser.rs
@@ -445,6 +445,33 @@ pub fn ascii_char<'a>(expected: char) -> impl Parser<'a, ()> {
     }
 }
 
+/// One or more ASCII hex digits. (Useful when parsing unicode escape codes,
+/// which must consist entirely of ASCII hex digits.)
+///
+/// On success, yields the matched digits and the state advanced past them.
+/// Fails with `Attempting::HexDigit` if the first byte is not a hex digit,
+/// or if the input ends before a non-hex-digit byte is found.
+pub fn ascii_hex_digits<'a>() -> impl Parser<'a, &'a str> {
+    move |arena, state: State<'a>| {
+        let mut buf = bumpalo::collections::String::new_in(arena);
+
+        for &byte in state.bytes.iter() {
+            if (byte as char).is_ascii_hexdigit() {
+                buf.push(byte as char);
+            } else if buf.is_empty() {
+                // We didn't find any hex digits!
+                return Err(unexpected(0, state, Attempting::HexDigit));
+            } else {
+                let state = state.advance_without_indenting(buf.len())?;
+
+                return Ok((buf.into_bump_str(), state));
+            }
+        }
+
+        Err(unexpected_eof(0, Attempting::HexDigit, state))
+    }
+}
+
 /// A single UTF-8-encoded char. This will both parse *and* validate that the
 /// char is valid UTF-8, but it will *not* advance the state.
 pub fn peek_utf8_char<'a>(state: &State<'a>) -> Result<(char, usize), FailReason> {
diff --git a/compiler/parse/src/string_literal.rs b/compiler/parse/src/string_literal.rs
index d34f43b137..25aa9a0db6 100644
--- a/compiler/parse/src/string_literal.rs
+++ b/compiler/parse/src/string_literal.rs
@@ -1,11 +1,11 @@
 use crate::ast::{Attempting, StrLiteral, StrSegment};
 use crate::expr;
 use crate::parser::{
-    allocated, ascii_char, loc, parse_utf8, unexpected, unexpected_eof, ParseResult, Parser, State,
+    allocated, ascii_char, ascii_hex_digits, loc, parse_utf8, unexpected, unexpected_eof,
+    ParseResult, Parser, State,
 };
 use bumpalo::collections::vec::Vec;
 use bumpalo::Bump;
-use roc_region::all::{Located, Region};
 
 pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
     use StrLiteral::*;
@@ -176,38 +176,28 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
                             state = new_state;
                         }
                         Some(b'u') => {
-                            // This is an escaped unicode character
-                            if let Some(b'(') = bytes.next() {
-                                segment_parsed_bytes += 1;
-                            } else {
-                                // Whenever we encounter `\u` it must be followed
-                                // by a `(` char!
-                                return Err(unexpected(0, state, Attempting::StrLiteral));
+                            // Advance past the `\u` before parsing the hex digits
+                            state = state.advance_without_indenting(2)?;
+
+                            let original_byte_count = state.bytes.len();
+
+                            // Parse the hex digits, surrounded by parens. This only
+                            // checks that they are hex digits; it does not check
+                            // that they form a valid unicode code point.
+                            let (loc_digits, new_state) =
+                                between!(ascii_char('('), loc(ascii_hex_digits()), ascii_char(')'))
+                                    .parse(arena, state)?;
+
+                            // Advance the iterator past the bytes we just parsed.
+                            for _ in 0..(original_byte_count - new_state.bytes.len()) {
+                                bytes.next();
                             }
 
-                            while let Some(&byte) = bytes.next() {
-                                segment_parsed_bytes += 1;
+                            segments.push(StrSegment::Unicode(loc_digits));
 
-                                if (byte as char).is_ascii_hexdigit() {
-                                    // This is the most common case.
-                                } else if byte == b')' {
-                                    // Add the segment
-                                    end_segment!(|string: &'a str| {
-                                        let value = &string[0..string.len() - 1];
-
-                                        StrSegment::Unicode(Located {
-                                            region: Region::zero(), // TODO calculate the right region
-                                            value,
-                                        })
-                                    });
-
-                                    // We're done parsing digits now.
-                                    break;
-                                } else {
-                                    // Unicode escapes must all be digits!
-                                    return Err(unexpected(0, state, Attempting::StrLiteral));
-                                }
-                            }
+                            // Reset the segment
+                            segment_parsed_bytes = 0;
+                            state = new_state;
                         }
                         Some(b'\\') => {
                             escaped_char!('\\');
diff --git a/compiler/parse/tests/test_parse.rs b/compiler/parse/tests/test_parse.rs
index 7754275d83..219708334d 100644
--- a/compiler/parse/tests/test_parse.rs
+++ b/compiler/parse/tests/test_parse.rs
@@ -50,6 +50,15 @@ mod test_parse {
         assert_eq!(Err(expected_fail), actual);
     }
 
+    fn assert_segments<E: Fn(&Bump) -> Vec<'_, ast::StrSegment<'_>>>(input: &str, to_expected: E) {
+        let arena = Bump::new();
+        let actual = parse_with(&arena, arena.alloc(input));
+        let expected_slice = to_expected(&arena).into_bump_slice();
+        let expected_expr = Expr::Str(Line(expected_slice));
+
+        assert_eq!(Ok(expected_expr), actual);
+    }
+
     fn parses_with_escaped_char<
         I: Fn(&str) -> String,
         E: Fn(char, &Bump) -> Vec<'_, ast::StrSegment<'_>>,
@@ -162,23 +171,57 @@ mod test_parse {
         );
     }
 
-    // INTERPOLATION
+    // UNICODE ESCAPES
 
-    fn assert_interpolations<E: Fn(&Bump) -> Vec<'_, ast::StrSegment<'_>>>(
-        input: &str,
-        to_expected: E,
-    ) {
-        let arena = Bump::new();
-        let actual = parse_with(&arena, arena.alloc(input));
-        let expected_slice = to_expected(&arena).into_bump_slice();
-        let expected_expr = Expr::Str(Line(expected_slice));
-
-        assert_eq!(Ok(expected_expr), actual);
+    #[test]
+    fn unicode_escape_in_middle() {
+        assert_segments(r#""Hi, \u(123)!""#, |arena| {
+            bumpalo::vec![in arena;
+                Plaintext("Hi, "),
+                Unicode(Located::new(0, 0, 8, 11, "123")),
+                Plaintext("!")
+            ]
+        });
     }
 
+    #[test]
+    fn unicode_escape_in_front() {
+        assert_segments(r#""\u(1234) is a unicode char""#, |arena| {
+            bumpalo::vec![in arena;
+                Unicode(Located::new(0, 0, 4, 8, "1234")),
+                Plaintext(" is a unicode char")
+            ]
+        });
+    }
+
+    #[test]
+    fn unicode_escape_in_back() {
+        assert_segments(r#""this is unicode: \u(1)""#, |arena| {
+            bumpalo::vec![in arena;
+                Plaintext("this is unicode: "),
+                Unicode(Located::new(0, 0, 21, 22, "1"))
+            ]
+        });
+    }
+
+    #[test]
+    fn unicode_escape_multiple() {
+        assert_segments(r#""\u(a1) this is \u(2Bcd) unicode \u(ef97)""#, |arena| {
+            bumpalo::vec![in arena;
+                Unicode(Located::new(0, 0, 4, 6, "a1")),
+                Plaintext(" this is "),
+                Unicode(Located::new(0, 0, 19, 23, "2Bcd")),
+                Plaintext(" unicode "),
+                Unicode(Located::new(0, 0, 36, 40, "ef97"))
+            ]
+        });
+    }
+
+    // INTERPOLATION
+
     #[test]
     fn string_with_interpolation_in_middle() {
-        assert_interpolations(r#""Hi, \(name)!""#, |arena| {
+        assert_segments(r#""Hi, \(name)!""#, |arena| {
             let expr = arena.alloc(Var {
                 module_name: "",
                 ident: "name",
@@ -194,7 +237,7 @@ mod test_parse {
 
     #[test]
     fn string_with_interpolation_in_front() {
-        assert_interpolations(r#""\(name), hi!""#, |arena| {
+        assert_segments(r#""\(name), hi!""#, |arena| {
             let expr = arena.alloc(Var {
                 module_name: "",
                 ident: "name",
@@ -209,7 +252,7 @@ mod test_parse {
 
     #[test]
     fn string_with_interpolation_in_back() {
-        assert_interpolations(r#""Hello \(name)""#, |arena| {
+        assert_segments(r#""Hello \(name)""#, |arena| {
             let expr = arena.alloc(Var {
                 module_name: "",
                 ident: "name",
@@ -224,7 +267,7 @@ mod test_parse {
 
     #[test]
     fn string_with_multiple_interpolations() {
-        assert_interpolations(r#""Hi, \(name)! How is \(project) going?""#, |arena| {
+        assert_segments(r#""Hi, \(name)! How is \(project) going?""#, |arena| {
             let expr1 = arena.alloc(Var {
                 module_name: "",
                 ident: "name",