Parsing support for snake_case identifiers

In this initial commit, I have done the following:

- Added unit tests to roc_parse's ident.rs file to cover at least the
  simplest Ident enum cases (Tag, OpaqueRef, and simple Access)
- Added '_' as a valid "rest" character in both uppercase and lowercase
  identifier parts
- Updated the test_syntax snapshots appropriately

There is still a lot left to do here. Such as:

- Do we want to allow multiple '_'s to parse successfully?
- Handle qualified access
- Handle accessor functions
- Handle record update functions
- Remove the UnderscoreInMiddle case from BadIdent
- Write unit tests for Malformed Idents

I am not a "Rustacean" by any means, but have been through the Book in
years past.  Any feedback on the way I wrote the tests or any other part
of the implementation would be very appreciated.
This commit is contained in:
Anthony Bullard 2024-11-20 09:00:57 -06:00
parent d7825428df
commit a2083cec30
No known key found for this signature in database
31 changed files with 1214 additions and 460 deletions

View file

@ -231,6 +231,7 @@ pub enum BadIdent {
UnderscoreAlone(Position),
UnderscoreInMiddle(Position),
TooManyUnderscores(Position),
UnderscoreAtStart {
position: Position,
/// If this variable was already declared in a pattern (e.g. \_x -> _x),
@ -252,11 +253,21 @@ fn is_alnum(ch: char) -> bool {
}
fn chomp_lowercase_part(buffer: &[u8]) -> Result<&str, Progress> {
chomp_part(char::is_lowercase, is_alnum, true, buffer)
chomp_part(
char::is_lowercase,
is_plausible_ident_continue,
true,
buffer,
)
}
fn chomp_uppercase_part(buffer: &[u8]) -> Result<&str, Progress> {
chomp_part(char::is_uppercase, is_alnum, false, buffer)
chomp_part(
char::is_uppercase,
is_plausible_ident_continue,
false,
buffer,
)
}
fn chomp_anycase_part(buffer: &[u8]) -> Result<&str, Progress> {
@ -265,7 +276,12 @@ fn chomp_anycase_part(buffer: &[u8]) -> Result<&str, Progress> {
let allow_bang =
char::from_utf8_slice_start(buffer).map_or(false, |(leading, _)| leading.is_lowercase());
chomp_part(char::is_alphabetic, is_alnum, allow_bang, buffer)
chomp_part(
char::is_alphabetic,
is_plausible_ident_continue,
allow_bang,
buffer,
)
}
fn chomp_integer_part(buffer: &[u8]) -> Result<&str, Progress> {
@ -429,7 +445,14 @@ fn chomp_opaque_ref(buffer: &[u8], pos: Position) -> Result<&str, BadIdent> {
Err(bad_ident(pos.bump_column(width as u32)))
} else {
let value = unsafe { std::str::from_utf8_unchecked(&buffer[..width]) };
Ok(value)
if value.contains('_') {
// we don't allow underscores in the middle of an identifier
// but still parse them (and generate a malformed identifier)
// to give good error messages for this case
Err(BadIdent::UnderscoreInMiddle(pos.bump_column(width as u32)))
} else {
Ok(value)
}
}
}
Err(_) => Err(bad_ident(pos.bump_column(1))),
@ -486,7 +509,7 @@ fn chomp_identifier_chain<'a>(
}
while let Ok((ch, width)) = char::from_utf8_slice_start(&buffer[chomped..]) {
if ch.is_alphabetic() || ch.is_ascii_digit() {
if ch.is_alphabetic() || ch.is_ascii_digit() || ch == '_' {
chomped += width;
} else if ch == '!' && !first_is_uppercase {
chomped += width;
@ -556,19 +579,20 @@ fn chomp_identifier_chain<'a>(
BadIdent::WeirdDotAccess(pos.bump_column(chomped as u32 + width)),
)),
}
} else if let Ok(('_', _)) = char::from_utf8_slice_start(&buffer[chomped..]) {
// we don't allow underscores in the middle of an identifier
// but still parse them (and generate a malformed identifier)
// to give good error messages for this case
Err((
chomped as u32 + 1,
BadIdent::UnderscoreInMiddle(pos.bump_column(chomped as u32 + 1)),
))
} else if first_is_uppercase {
// just one segment, starting with an uppercase letter; that's a tag
let value = unsafe { std::str::from_utf8_unchecked(&buffer[..chomped]) };
Ok((chomped as u32, Ident::Tag(value)))
if value.contains('_') {
// we don't allow underscores in the middle of an identifier
// but still parse them (and generate a malformed identifier)
// to give good error messages for this case
Err((
chomped as u32,
BadIdent::UnderscoreInMiddle(pos.bump_column(chomped as u32)),
))
} else {
Ok((chomped as u32, Ident::Tag(value)))
}
} else {
// just one segment, starting with a lowercase letter; that's a normal identifier
let value = unsafe { std::str::from_utf8_unchecked(&buffer[..chomped]) };
@ -689,3 +713,121 @@ fn chomp_access_chain<'a>(buffer: &'a [u8], parts: &mut Vec<'a, Accessor<'a>>) -
Ok(chomped as u32)
}
}
#[cfg(test)]
mod tests {
use super::*;
fn assert_ident_parses<'a>(arena: &'a Bump, ident: &str, expected: Ident<'a>) {
let s = State::new(ident.as_bytes());
let (_, id, _) = parse_ident(arena, s, 0).unwrap();
assert_eq!(id, expected);
}
fn assert_ident_parses_tag(arena: &Bump, ident: &str) {
assert_ident_parses(arena, ident, Ident::Tag(ident));
}
fn assert_ident_parses_opaque(arena: &Bump, ident: &str) {
assert_ident_parses(arena, ident, Ident::OpaqueRef(ident));
}
fn assert_ident_parses_simple_access(arena: &Bump, ident: &str) {
assert_ident_parses(
arena,
ident,
Ident::Access {
module_name: "",
parts: arena.alloc([Accessor::RecordField(ident)]),
},
);
}
fn assert_ident_parses_malformed(arena: &Bump, ident: &str, pos: Position) {
assert_ident_parses(
arena,
ident,
Ident::Malformed(ident, BadIdent::UnderscoreInMiddle(pos)),
);
}
#[test]
fn test_parse_ident_lowercase_camel() {
let arena = Bump::new();
assert_ident_parses_simple_access(&arena, "hello");
assert_ident_parses_simple_access(&arena, "hello23");
assert_ident_parses_simple_access(&arena, "helloWorld");
assert_ident_parses_simple_access(&arena, "helloWorld23");
assert_ident_parses_simple_access(&arena, "helloWorldThisIsQuiteATag");
assert_ident_parses_simple_access(&arena, "helloWorldThisIsQuiteATag_");
assert_ident_parses_simple_access(&arena, "helloworldthisisquiteatag_");
assert_ident_parses_simple_access(&arena, "helloWorldThisIsQuiteATag23");
assert_ident_parses_simple_access(&arena, "helloWorldThisIsQuiteATag23_");
assert_ident_parses_simple_access(&arena, "helloworldthisisquiteatag23_");
}
#[test]
fn test_parse_ident_lowercase_snake() {
let arena = Bump::new();
assert_ident_parses_simple_access(&arena, "hello_world");
assert_ident_parses_simple_access(&arena, "hello_world23");
assert_ident_parses_simple_access(&arena, "hello_world_this_is_quite_a_tag");
assert_ident_parses_simple_access(&arena, "hello_world_this_is_quite_a_tag_");
assert_ident_parses_simple_access(&arena, "hello_world_this_is_quite_a_tag23");
assert_ident_parses_simple_access(&arena, "hello_world_this_is_quite_a_tag23_");
}
#[test]
fn test_parse_tag_camel() {
let arena = Bump::new();
assert_ident_parses_tag(&arena, "Hello");
assert_ident_parses_tag(&arena, "Hello23");
assert_ident_parses_tag(&arena, "HelloWorld");
assert_ident_parses_tag(&arena, "HelloWorld23");
assert_ident_parses_tag(&arena, "HelloWorldThisIsQuiteATag");
assert_ident_parses_tag(&arena, "HelloWorldThisIsQuiteATag23");
}
#[test]
fn test_parse_tag_snake_is_malformed() {
let arena = Bump::new();
assert_ident_parses_malformed(&arena, "Hello_World", Position { offset: 11 });
assert_ident_parses_malformed(&arena, "Hello_World23", Position { offset: 13 });
assert_ident_parses_malformed(
&arena,
"Hello_World_This_Is_Quite_A_Tag",
Position { offset: 31 },
);
assert_ident_parses_malformed(
&arena,
"Hello_World_This_Is_Quite_A_Tag23",
Position { offset: 33 },
);
}
#[test]
fn test_parse_opaque_ref_camel() {
let arena = Bump::new();
assert_ident_parses_opaque(&arena, "@Hello");
assert_ident_parses_opaque(&arena, "@Hello23");
assert_ident_parses_opaque(&arena, "@HelloWorld");
assert_ident_parses_opaque(&arena, "@HelloWorld23");
assert_ident_parses_opaque(&arena, "@HelloWorldThisIsQuiteARef");
assert_ident_parses_opaque(&arena, "@HelloWorldThisIsQuiteARef23");
}
#[test]
fn test_parse_opaque_ref_snake_is_malformed() {
let arena = Bump::new();
assert_ident_parses_malformed(&arena, "@Hello_World", Position { offset: 12 });
assert_ident_parses_malformed(&arena, "@Hello_World23", Position { offset: 14 });
assert_ident_parses_malformed(
&arena,
"@Hello_World_This_Is_Quite_A_Ref",
Position { offset: 32 },
);
assert_ident_parses_malformed(
&arena,
"@Hello_World_This_Is_Quite_A_Ref23",
Position { offset: 34 },
);
}
}

View file

@ -855,6 +855,7 @@ fn remove_spaces_bad_ident(ident: BadIdent) -> BadIdent {
position: Position::zero(),
declaration_region,
},
BadIdent::TooManyUnderscores(_) => BadIdent::TooManyUnderscores(Position::zero()),
BadIdent::QualifiedTag(_) => BadIdent::QualifiedTag(Position::zero()),
BadIdent::WeirdAccessor(_) => BadIdent::WeirdAccessor(Position::zero()),
BadIdent::WeirdDotAccess(_) => BadIdent::WeirdDotAccess(Position::zero()),