Parse unicode escape sequences

2025-09-28 14:24:45 +00:00 · 2020-08-29 14:14:16 -04:00 · 2020-08-29 14:14:16 -04:00 · 5080a7e24b
commit 5080a7e24b
parent b995ccef75
4 changed files with 103 additions and 46 deletions
--- a/compiler/parse/src/ast.rs
+++ b/compiler/parse/src/ast.rs
@ -609,6 +609,7 @@ pub enum Attempting {
    Module,
    Record,
    Identifier,
+    HexDigit,
    ConcreteType,
    TypeVariable,
    WhenCondition,
--- a/compiler/parse/src/parser.rs
+++ b/compiler/parse/src/parser.rs
@ -445,6 +445,29 @@ pub fn ascii_char<'a>(expected: char) -> impl Parser<'a, ()> {
    }
 }

+/// One or more ASCII hex digits. (Useful when parsing unicode escape codes,
+/// which must consist entirely of ASCII hex digits.) Fails while attempting
+/// `HexDigit` if no digit is found or the input ends before the run does.
+pub fn ascii_hex_digits<'a>() -> impl Parser<'a, &'a str> {
+    move |arena, state: State<'a>| {
+        let mut buf = bumpalo::collections::String::new_in(arena);
+
+        for &byte in state.bytes.iter() {
+            if (byte as char).is_ascii_hexdigit() {
+                buf.push(byte as char);
+            } else if buf.is_empty() {
+                // We didn't find any hex digits! Report HexDigit, like the EOF case.
+                return Err(unexpected(0, state, Attempting::HexDigit));
+            } else {
+                let state = state.advance_without_indenting(buf.len())?;
+                return Ok((buf.into_bump_str(), state));
+            }
+        }
+        // Every remaining byte (if any) was a hex digit, so we hit end of input.
+        Err(unexpected_eof(0, Attempting::HexDigit, state))
+    }
+}
+
 /// A single UTF-8-encoded char. This will both parse *and* validate that the
 /// char is valid UTF-8, but it will *not* advance the state.
 pub fn peek_utf8_char<'a>(state: &State<'a>) -> Result<(char, usize), FailReason> {
--- a/compiler/parse/src/string_literal.rs
+++ b/compiler/parse/src/string_literal.rs
@ -1,11 +1,11 @@
 use crate::ast::{Attempting, StrLiteral, StrSegment};
 use crate::expr;
 use crate::parser::{
-    allocated, ascii_char, loc, parse_utf8, unexpected, unexpected_eof, ParseResult, Parser, State,
+    allocated, ascii_char, ascii_hex_digits, loc, parse_utf8, unexpected, unexpected_eof,
+    ParseResult, Parser, State,
 };
 use bumpalo::collections::vec::Vec;
 use bumpalo::Bump;
-use roc_region::all::{Located, Region};

 pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
    use StrLiteral::*;
@ -176,38 +176,28 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
                            state = new_state;
                        }
                        Some(b'u') => {
-                            // This is an escaped unicode character
-                            if let Some(b'(') = bytes.next() {
-                                segment_parsed_bytes += 1;
-                            } else {
-                                // Whenever we encounter `\u` it must be followed
-                                // by a `(` char!
-                                return Err(unexpected(0, state, Attempting::StrLiteral));
+                            // Advance past the `\u` before using the expr parser
+                            state = state.advance_without_indenting(2)?;
+
+                            let original_byte_count = state.bytes.len();
+
+                            // Parse the hex digits, surrounded by parens, then
+                            // give a canonicalization error if the digits form
+                            // an invalid unicode code point.
+                            let (loc_digits, new_state) =
+                                between!(ascii_char('('), loc(ascii_hex_digits()), ascii_char(')'))
+                                    .parse(arena, state)?;
+
+                            // Advance the iterator past the expr we just parsed.
+                            for _ in 0..(original_byte_count - new_state.bytes.len()) {
+                                bytes.next();
                            }

-                            while let Some(&byte) = bytes.next() {
-                                segment_parsed_bytes += 1;
+                            segments.push(StrSegment::Unicode(loc_digits));

-                                if (byte as char).is_ascii_hexdigit() {
-                                    // This is the most common case.
-                                } else if byte == b')' {
-                                    // Add the segment
-                                    end_segment!(|string: &'a str| {
-                                        let value = &string[0..string.len() - 1];
-
-                                        StrSegment::Unicode(Located {
-                                            region: Region::zero(), // TODO calculate the right region
-                                            value,
-                                        })
-                                    });
-
-                                    // We're done parsing digits now.
-                                    break;
-                                } else {
-                                    // Unicode escapes must all be digits!
-                                    return Err(unexpected(0, state, Attempting::StrLiteral));
-                                }
-                            }
+                            // Reset the segment
+                            segment_parsed_bytes = 0;
+                            state = new_state;
                        }
                        Some(b'\\') => {
                            escaped_char!('\\');
--- a/compiler/parse/tests/test_parse.rs
+++ b/compiler/parse/tests/test_parse.rs
@ -50,6 +50,15 @@ mod test_parse {
        assert_eq!(Err(expected_fail), actual);
    }

+    /// Parses `input` and asserts the result is a single-line string literal
+    /// whose segments equal the ones built by `to_expected`.
+    fn assert_segments<E: Fn(&Bump) -> Vec<'_, ast::StrSegment<'_>>>(input: &str, to_expected: E) {
+        let arena = Bump::new();
+        let parsed = parse_with(&arena, arena.alloc(input));
+        let segments = to_expected(&arena).into_bump_slice();
+        assert_eq!(Ok(Expr::Str(Line(segments))), parsed);
+    }
+
    fn parses_with_escaped_char<
        I: Fn(&str) -> String,
        E: Fn(char, &Bump) -> Vec<'_, ast::StrSegment<'_>>,
@ -162,23 +171,57 @@ mod test_parse {
        );
    }

-    // INTERPOLATION
+    // UNICODE ESCAPES

-    fn assert_interpolations<E: Fn(&Bump) -> Vec<'_, ast::StrSegment<'_>>>(
-        input: &str,
-        to_expected: E,
-    ) {
-        let arena = Bump::new();
-        let actual = parse_with(&arena, arena.alloc(input));
-        let expected_slice = to_expected(&arena).into_bump_slice();
-        let expected_expr = Expr::Str(Line(expected_slice));
-
-        assert_eq!(Ok(expected_expr), actual);
+    #[test]
+    fn unicode_escape_in_middle() {
+        assert_segments(r#""Hi, \u(123)!""#, |arena| {
+            bumpalo::vec![in arena;
+                Plaintext("Hi, "),
+                Unicode(Located::new(0, 0, 8, 11, "123")), // 8..11 = offsets of `123` in the input (opening quote is offset 0)
+                Plaintext("!")
+            ]
+        });
    }

+    #[test]
+    fn unicode_escape_in_front() {
+        assert_segments(r#""\u(1234) is a unicode char""#, |arena| {
+            bumpalo::vec![in arena;
+                Unicode(Located::new(0, 0, 4, 8, "1234")), // 4..8 = offsets of `1234` in the input (opening quote is offset 0)
+                Plaintext(" is a unicode char")
+            ]
+        });
+    }
+
+    #[test]
+    fn unicode_escape_in_back() {
+        assert_segments(r#""this is unicode: \u(1)""#, |arena| {
+            bumpalo::vec![in arena;
+                Plaintext("this is unicode: "),
+                Unicode(Located::new(0, 0, 21, 22, "1")) // 21..22 = offset of the single digit `1` in the input
+            ]
+        });
+    }
+
+    #[test]
+    fn unicode_escape_multiple() {
+        assert_segments(r#""\u(a1) this is \u(2Bcd) unicode \u(ef97)""#, |arena| {
+            bumpalo::vec![in arena;
+                Unicode(Located::new(0, 0, 4, 6, "a1")), // 4..6 = offsets of `a1` in the input
+                Plaintext(" this is "),
+                Unicode(Located::new(0, 0, 19, 23, "2Bcd")), // 19..23 = offsets of `2Bcd`; mixed-case digits are kept verbatim
+                Plaintext(" unicode "),
+                Unicode(Located::new(0, 0, 36, 40, "ef97")) // 36..40 = offsets of `ef97`
+            ]
+        });
+    }
+
+    // INTERPOLATION
+
    #[test]
    fn string_with_interpolation_in_middle() {
-        assert_interpolations(r#""Hi, \(name)!""#, |arena| {
+        assert_segments(r#""Hi, \(name)!""#, |arena| {
            let expr = arena.alloc(Var {
                module_name: "",
                ident: "name",
@ -194,7 +237,7 @@ mod test_parse {

    #[test]
    fn string_with_interpolation_in_front() {
-        assert_interpolations(r#""\(name), hi!""#, |arena| {
+        assert_segments(r#""\(name), hi!""#, |arena| {
            let expr = arena.alloc(Var {
                module_name: "",
                ident: "name",
@ -209,7 +252,7 @@ mod test_parse {

    #[test]
    fn string_with_interpolation_in_back() {
-        assert_interpolations(r#""Hello \(name)""#, |arena| {
+        assert_segments(r#""Hello \(name)""#, |arena| {
            let expr = arena.alloc(Var {
                module_name: "",
                ident: "name",
@ -224,7 +267,7 @@ mod test_parse {

    #[test]
    fn string_with_multiple_interpolations() {
-        assert_interpolations(r#""Hi, \(name)! How is \(project) going?""#, |arena| {
+        assert_segments(r#""Hi, \(name)! How is \(project) going?""#, |arena| {
            let expr1 = arena.alloc(Var {
                module_name: "",
                ident: "name",