Parsing support for snake_case identifiers

In this initial commit, I have done the following:

- Added unit tests to roc_parse's ident.rs file to cover at least the simplest Ident enum cases (Tag, OpaqueRef, and simple Access)
- Added '_' as a valid "rest" character in both uppercase and lowercase identifier parts
- Updated the test_syntax snapshots appropriately

There is still a lot left to do here, such as:

- Do we want to allow multiple '_'s to parse successfully?
- Handle qualified access
- Handle accessor functions
- Handle record update functions
- Remove the UnderscoreInMiddle case from BadIdent
- Write unit tests for Malformed Idents

I am not a "Rustacean" by any means, but I have been through the Book in years past. Any feedback on the way I wrote the tests, or on any other part of the implementation, would be much appreciated.
2025-09-19 01:59:48 +00:00 · 2024-11-20 09:00:57 -06:00 · 2024-11-20 09:00:57 -06:00 · a2083cec30
commit a2083cec30
parent d7825428df
31 changed files with 1214 additions and 460 deletions
--- a/crates/compiler/parse/src/ident.rs
+++ b/crates/compiler/parse/src/ident.rs
@ -231,6 +231,7 @@ pub enum BadIdent {

    UnderscoreAlone(Position),
    UnderscoreInMiddle(Position),
+    TooManyUnderscores(Position),
    UnderscoreAtStart {
        position: Position,
        /// If this variable was already declared in a pattern (e.g. \_x -> _x),
@ -252,11 +253,21 @@ fn is_alnum(ch: char) -> bool {
 }

 fn chomp_lowercase_part(buffer: &[u8]) -> Result<&str, Progress> {
-    chomp_part(char::is_lowercase, is_alnum, true, buffer)
+    chomp_part(
+        char::is_lowercase,
+        is_plausible_ident_continue,
+        true,
+        buffer,
+    )
 }

 fn chomp_uppercase_part(buffer: &[u8]) -> Result<&str, Progress> {
-    chomp_part(char::is_uppercase, is_alnum, false, buffer)
+    chomp_part(
+        char::is_uppercase,
+        is_plausible_ident_continue,
+        false,
+        buffer,
+    )
 }

 fn chomp_anycase_part(buffer: &[u8]) -> Result<&str, Progress> {
@ -265,7 +276,12 @@ fn chomp_anycase_part(buffer: &[u8]) -> Result<&str, Progress> {
    let allow_bang =
        char::from_utf8_slice_start(buffer).map_or(false, |(leading, _)| leading.is_lowercase());

-    chomp_part(char::is_alphabetic, is_alnum, allow_bang, buffer)
+    chomp_part(
+        char::is_alphabetic,
+        is_plausible_ident_continue,
+        allow_bang,
+        buffer,
+    )
 }

 fn chomp_integer_part(buffer: &[u8]) -> Result<&str, Progress> {
@ -429,7 +445,14 @@ fn chomp_opaque_ref(buffer: &[u8], pos: Position) -> Result<&str, BadIdent> {
                Err(bad_ident(pos.bump_column(width as u32)))
            } else {
                let value = unsafe { std::str::from_utf8_unchecked(&buffer[..width]) };
-                Ok(value)
+                if value.contains('_') {
+                    // we don't allow underscores in the middle of an identifier
+                    // but still parse them (and generate a malformed identifier)
+                    // to give good error messages for this case
+                    Err(BadIdent::UnderscoreInMiddle(pos.bump_column(width as u32)))
+                } else {
+                    Ok(value)
+                }
            }
        }
        Err(_) => Err(bad_ident(pos.bump_column(1))),
@ -486,7 +509,7 @@ fn chomp_identifier_chain<'a>(
    }

    while let Ok((ch, width)) = char::from_utf8_slice_start(&buffer[chomped..]) {
-        if ch.is_alphabetic() || ch.is_ascii_digit() {
+        if ch.is_alphabetic() || ch.is_ascii_digit() || ch == '_' {
            chomped += width;
        } else if ch == '!' && !first_is_uppercase {
            chomped += width;
@ -556,19 +579,20 @@ fn chomp_identifier_chain<'a>(
                BadIdent::WeirdDotAccess(pos.bump_column(chomped as u32 + width)),
            )),
        }
-    } else if let Ok(('_', _)) = char::from_utf8_slice_start(&buffer[chomped..]) {
-        // we don't allow underscores in the middle of an identifier
-        // but still parse them (and generate a malformed identifier)
-        // to give good error messages for this case
-        Err((
-            chomped as u32 + 1,
-            BadIdent::UnderscoreInMiddle(pos.bump_column(chomped as u32 + 1)),
-        ))
    } else if first_is_uppercase {
        // just one segment, starting with an uppercase letter; that's a tag
        let value = unsafe { std::str::from_utf8_unchecked(&buffer[..chomped]) };
-
-        Ok((chomped as u32, Ident::Tag(value)))
+        if value.contains('_') {
+            // we don't allow underscores in the middle of an identifier
+            // but still parse them (and generate a malformed identifier)
+            // to give good error messages for this case
+            Err((
+                chomped as u32,
+                BadIdent::UnderscoreInMiddle(pos.bump_column(chomped as u32)),
+            ))
+        } else {
+            Ok((chomped as u32, Ident::Tag(value)))
+        }
    } else {
        // just one segment, starting with a lowercase letter; that's a normal identifier
        let value = unsafe { std::str::from_utf8_unchecked(&buffer[..chomped]) };
@ -689,3 +713,121 @@ fn chomp_access_chain<'a>(buffer: &'a [u8], parts: &mut Vec<'a, Accessor<'a>>) -
        Ok(chomped as u32)
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn assert_ident_parses<'a>(arena: &'a Bump, ident: &str, expected: Ident<'a>) {
+        let s = State::new(ident.as_bytes());
+        let (_, id, _) = parse_ident(arena, s, 0).unwrap();
+        assert_eq!(id, expected);
+    }
+
+    fn assert_ident_parses_tag(arena: &Bump, ident: &str) {
+        assert_ident_parses(arena, ident, Ident::Tag(ident));
+    }
+    fn assert_ident_parses_opaque(arena: &Bump, ident: &str) {
+        assert_ident_parses(arena, ident, Ident::OpaqueRef(ident));
+    }
+    fn assert_ident_parses_simple_access(arena: &Bump, ident: &str) {
+        assert_ident_parses(
+            arena,
+            ident,
+            Ident::Access {
+                module_name: "",
+                parts: arena.alloc([Accessor::RecordField(ident)]),
+            },
+        );
+    }
+
+    fn assert_ident_parses_malformed(arena: &Bump, ident: &str, pos: Position) {
+        assert_ident_parses(
+            arena,
+            ident,
+            Ident::Malformed(ident, BadIdent::UnderscoreInMiddle(pos)),
+        );
+    }
+
+    #[test]
+    fn test_parse_ident_lowercase_camel() {
+        let arena = Bump::new();
+        assert_ident_parses_simple_access(&arena, "hello");
+        assert_ident_parses_simple_access(&arena, "hello23");
+        assert_ident_parses_simple_access(&arena, "helloWorld");
+        assert_ident_parses_simple_access(&arena, "helloWorld23");
+        assert_ident_parses_simple_access(&arena, "helloWorldThisIsQuiteATag");
+        assert_ident_parses_simple_access(&arena, "helloWorldThisIsQuiteATag_");
+        assert_ident_parses_simple_access(&arena, "helloworldthisisquiteatag_");
+        assert_ident_parses_simple_access(&arena, "helloWorldThisIsQuiteATag23");
+        assert_ident_parses_simple_access(&arena, "helloWorldThisIsQuiteATag23_");
+        assert_ident_parses_simple_access(&arena, "helloworldthisisquiteatag23_");
+    }
+
+    #[test]
+    fn test_parse_ident_lowercase_snake() {
+        let arena = Bump::new();
+        assert_ident_parses_simple_access(&arena, "hello_world");
+        assert_ident_parses_simple_access(&arena, "hello_world23");
+        assert_ident_parses_simple_access(&arena, "hello_world_this_is_quite_a_tag");
+        assert_ident_parses_simple_access(&arena, "hello_world_this_is_quite_a_tag_");
+        assert_ident_parses_simple_access(&arena, "hello_world_this_is_quite_a_tag23");
+        assert_ident_parses_simple_access(&arena, "hello_world_this_is_quite_a_tag23_");
+    }
+
+    #[test]
+    fn test_parse_tag_camel() {
+        let arena = Bump::new();
+        assert_ident_parses_tag(&arena, "Hello");
+        assert_ident_parses_tag(&arena, "Hello23");
+        assert_ident_parses_tag(&arena, "HelloWorld");
+        assert_ident_parses_tag(&arena, "HelloWorld23");
+        assert_ident_parses_tag(&arena, "HelloWorldThisIsQuiteATag");
+        assert_ident_parses_tag(&arena, "HelloWorldThisIsQuiteATag23");
+    }
+
+    #[test]
+    fn test_parse_tag_snake_is_malformed() {
+        let arena = Bump::new();
+        assert_ident_parses_malformed(&arena, "Hello_World", Position { offset: 11 });
+        assert_ident_parses_malformed(&arena, "Hello_World23", Position { offset: 13 });
+        assert_ident_parses_malformed(
+            &arena,
+            "Hello_World_This_Is_Quite_A_Tag",
+            Position { offset: 31 },
+        );
+        assert_ident_parses_malformed(
+            &arena,
+            "Hello_World_This_Is_Quite_A_Tag23",
+            Position { offset: 33 },
+        );
+    }
+
+    #[test]
+    fn test_parse_opaque_ref_camel() {
+        let arena = Bump::new();
+        assert_ident_parses_opaque(&arena, "@Hello");
+        assert_ident_parses_opaque(&arena, "@Hello23");
+        assert_ident_parses_opaque(&arena, "@HelloWorld");
+        assert_ident_parses_opaque(&arena, "@HelloWorld23");
+        assert_ident_parses_opaque(&arena, "@HelloWorldThisIsQuiteARef");
+        assert_ident_parses_opaque(&arena, "@HelloWorldThisIsQuiteARef23");
+    }
+
+    #[test]
+    fn test_parse_opaque_ref_snake_is_malformed() {
+        let arena = Bump::new();
+        assert_ident_parses_malformed(&arena, "@Hello_World", Position { offset: 12 });
+        assert_ident_parses_malformed(&arena, "@Hello_World23", Position { offset: 14 });
+        assert_ident_parses_malformed(
+            &arena,
+            "@Hello_World_This_Is_Quite_A_Ref",
+            Position { offset: 32 },
+        );
+        assert_ident_parses_malformed(
+            &arena,
+            "@Hello_World_This_Is_Quite_A_Ref23",
+            Position { offset: 34 },
+        );
+    }
+}
--- a/crates/compiler/parse/src/normalize.rs
+++ b/crates/compiler/parse/src/normalize.rs
@ -855,6 +855,7 @@ fn remove_spaces_bad_ident(ident: BadIdent) -> BadIdent {
            position: Position::zero(),
            declaration_region,
        },
+        BadIdent::TooManyUnderscores(_) => BadIdent::TooManyUnderscores(Position::zero()),
        BadIdent::QualifiedTag(_) => BadIdent::QualifiedTag(Position::zero()),
        BadIdent::WeirdAccessor(_) => BadIdent::WeirdAccessor(Position::zero()),
        BadIdent::WeirdDotAccess(_) => BadIdent::WeirdDotAccess(Position::zero()),