improved private tag parsing

2025-09-28 06:14:46 +00:00 · 2021-03-10 17:26:42 +01:00 · 2021-03-10 17:26:42 +01:00 · ea32a37315
commit ea32a37315
parent edd54ab4ab
4 changed files with 242 additions and 70 deletions
--- a/compiler/parse/src/ident.rs
+++ b/compiler/parse/src/ident.rs
@ -299,25 +299,33 @@ pub enum BadIdent {
    WeirdDotQualified(Row, Col),
    DoubleDot(Row, Col),
    StrayDot(Row, Col),
+    StrayAt(Row, Col),
+    BadPrivateTag(Row, Col),
 }

-/// Parse an identifier into a string.
-///
-/// This is separate from the `ident` Parser because string interpolation
-/// wants to use it this way.
+/// Chomps one identifier part whose first character must be lowercase.
+///
+/// Delegates to `chomp_part`, using `char::is_lowercase` as the test for the
+/// leading character; the result (name or `Progress` error) is passed through as-is.
+fn chomp_lowercase_part(buffer: &[u8]) -> Result<&str, Progress> {
+    chomp_part(|c: char| c.is_lowercase(), buffer)
+}

-/// a `.foo` accessor function
-fn chomp_accessor(buffer: &[u8], row: Row, col: Col) -> Result<&str, BadIdent> {
+/// Chomps one identifier part whose first character must be uppercase.
+///
+/// Delegates to `chomp_part`, using `char::is_uppercase` as the test for the
+/// leading character; the result (name or `Progress` error) is passed through as-is.
+fn chomp_uppercase_part(buffer: &[u8]) -> Result<&str, Progress> {
+    chomp_part(|c: char| c.is_uppercase(), buffer)
+}
+
+#[inline(always)]
+fn chomp_part<F>(leading_is_good: F, buffer: &[u8]) -> Result<&str, Progress>
+where
+    F: Fn(char) -> bool,
+{
    // assumes the leading `.` has been chomped already
    use encode_unicode::CharExt;

    let mut chomped = 0;

    if let Ok((ch, width)) = char::from_utf8_slice_start(&buffer[chomped..]) {
-        if ch.is_lowercase() {
+        if leading_is_good(ch) {
            chomped += width;
        } else {
-            return Err(BadIdent::StrayDot(row, col + 1));
+            return Err(NoProgress);
        }
    }

@ -331,17 +339,56 @@ fn chomp_accessor(buffer: &[u8], row: Row, col: Col) -> Result<&str, BadIdent> {
    }

    if chomped == 0 {
-        Err(BadIdent::StrayDot(row, col + 1))
-    } else if let Ok(('.', _)) = char::from_utf8_slice_start(&buffer[chomped..]) {
-        Err(BadIdent::WeirdAccessor(row, col))
+        Err(NoProgress)
    } else {
        let name = unsafe { std::str::from_utf8_unchecked(&buffer[..chomped]) };

-        dbg!(name);
        Ok(name)
    }
 }

+/// a `.foo` accessor function
+///
+/// `buffer` must start just after the leading `.`; `row` and `col` are only used
+/// to position the error that is returned on failure.
+///
+/// Returns the lowercase name on success. Fails with
+/// `BadIdent::WeirdAccessor(row, col)` when the name is directly followed by
+/// another `.`, and with `BadIdent::StrayDot(row, col + 1)` when no lowercase
+/// name could be chomped.
+fn chomp_accessor(buffer: &[u8], row: Row, col: Col) -> Result<&str, BadIdent> {
+    // assumes the leading `.` has been chomped already
+    use encode_unicode::CharExt;
+
+    match chomp_lowercase_part(buffer) {
+        Ok(name) => {
+            // the name was chomped from the start of `buffer`, so its byte length
+            // is the offset of whatever comes right after it
+            let chomped = name.len();
+
+            // a `.` right after the name (e.g. `.foo.bar`) is not a valid accessor
+            if let Ok(('.', _)) = char::from_utf8_slice_start(&buffer[chomped..]) {
+                Err(BadIdent::WeirdAccessor(row, col))
+            } else {
+                Ok(name)
+            }
+        }
+        Err(_) => {
+            // we've already made progress with the initial `.`
+            Err(BadIdent::StrayDot(row, col + 1))
+        }
+    }
+}
+
+/// a `@Token` private tag
+///
+/// `buffer` must start at the `@` itself (checked with a debug assertion);
+/// `row` and `col` are only used to position the error returned on failure.
+///
+/// On success the returned name is the uppercase part after the `@`, i.e. it does
+/// NOT include the `@`. Fails with `BadIdent::BadPrivateTag` pointing at the `.`
+/// when the tag is directly followed by one, or at `col + 1` when no uppercase
+/// name follows the `@`.
+///
+/// NOTE(review): the code path this replaces built the `PrivateTag` value from
+/// `&bytes[..chomped]`, which appears to have included the leading `@` -- confirm
+/// that consumers of `Ident::PrivateTag` are fine with the `@` being dropped here.
+fn chomp_private_tag(buffer: &[u8], row: Row, col: Col) -> Result<&str, BadIdent> {
+    // assumes the leading `@` has NOT been chomped already
+    debug_assert_eq!(buffer.get(0), Some(&b'@'));
+    use encode_unicode::CharExt;
+
+    // skip the 1-byte `@` and chomp the uppercase name that must follow it
+    match chomp_uppercase_part(&buffer[1..]) {
+        Ok(name) => {
+            // total bytes consumed so far: the `@` plus the name
+            let chomped = 1 + name.len();
+
+            // a `.` right after the tag (`@Foo.bar`, `@Foo.Bar`) is an error that
+            // is reported at the position of that `.`
+            if let Ok(('.', _)) = char::from_utf8_slice_start(&buffer[chomped..]) {
+                Err(BadIdent::BadPrivateTag(row, col + chomped as u16))
+            } else {
+                Ok(name)
+            }
+        }
+        // nothing usable after the `@`: point at the character following it
+        Err(_) => Err(BadIdent::BadPrivateTag(row, col + 1)),
+    }
+}
+
 fn parse_ident_help_help<'a>(
    arena: &'a Bump,
    mut state: State<'a>,
@ -350,7 +397,6 @@ fn parse_ident_help_help<'a>(
    let mut noncapitalized_parts: Vec<&'a str> = Vec::new_in(arena);
    let mut is_capitalized;
    let is_accessor_fn;
-    let mut is_private_tag = false;

    let bytes = state.bytes;
    let mut chomped_capitalized = 0;
@ -387,35 +433,17 @@ fn parse_ident_help_help<'a>(
                    Err(fail) => return Err((MadeProgress, fail, state)),
                }
            } else if first_ch == '@' {
-                state = advance_state!(state, bytes_parsed)?;
+                match chomp_private_tag(state.bytes, state.line, state.column) {
+                    Ok(tagname) => {
+                        let bytes_parsed = 1 + tagname.len();

-                // '@' must always be followed by a capital letter!
-                match peek_utf8_char(&state) {
-                    Ok((next_ch, next_bytes_parsed)) => {
-                        if next_ch.is_uppercase() {
-                            state = advance_state!(state, next_bytes_parsed)?;
+                        state = advance_state!(state, bytes_parsed)?;

-                            part_buf.push('@');
-                            part_buf.push(next_ch);
-                            chomped_part_buf += 1 + next_bytes_parsed;
-
-                            is_private_tag = true;
-                            is_capitalized = true;
-                            is_accessor_fn = false;
-                        } else {
-                            return Err((
-                                MadeProgress,
-                                BadIdent::PrivateTagNotUppercase(state.line, state.column),
-                                state,
-                            ));
-                        }
+                        return Ok((MadeProgress, (Ident::PrivateTag(tagname), None), state));
                    }
-                    Err(_reason) => {
-                        return Err((
-                            MadeProgress,
-                            BadIdent::PrivateTagNotUppercase(state.line, state.column),
-                            state,
-                        ));
+                    Err(fail) => {
+                        state = advance_state!(state, 1)?;
+                        return Err((MadeProgress, fail, state));
                    }
                }
            } else {
@ -555,7 +583,7 @@ fn parse_ident_help_help<'a>(
    let answer = if is_accessor_fn {
        // Handle accessor functions first because they have the strictest requirements.
        // Accessor functions may have exactly 1 noncapitalized part, and no capitalzed parts.
-        if cparts == 0 && noncapitalized_parts.len() == 1 && !is_private_tag {
+        if cparts == 0 && noncapitalized_parts.len() == 1 {
            // an accessor starts with a `.`, but we drop that from the name
            let value = unsafe {
                std::str::from_utf8_unchecked(&bytes[1 + chomped..1 + chomped + chomped_part_buf])
@ -585,11 +613,7 @@ fn parse_ident_help_help<'a>(
            1 => {
                let chomped = chomped_capitalized;
                let value = unsafe { std::str::from_utf8_unchecked(&bytes[..chomped]) };
-                if is_private_tag {
-                    Ident::PrivateTag(value)
-                } else {
-                    Ident::GlobalTag(value)
-                }
+                Ident::GlobalTag(value)
            }
            _ => {
                // This is a qualified tag, which is not allowed!
@ -600,13 +624,6 @@ fn parse_ident_help_help<'a>(
                ));
            }
        }
-    } else if is_private_tag {
-        // This is qualified field access with an '@' in front, which does not make sense!
-        return Err((
-            MadeProgress,
-            BadIdent::PrivateTagFieldAccess(state.line, state.column),
-            state,
-        ));
    } else {
        // We have multiple noncapitalized parts, so this must be field access.
        let module_name = if cparts == 0 {
--- a/compiler/reporting/src/error/canonicalize.rs
+++ b/compiler/reporting/src/error/canonicalize.rs
@ -1,4 +1,5 @@
 use roc_collections::all::MutSet;
+use roc_parse::parser::{Col, Row};
 use roc_problem::can::PrecedenceProblem::BothNonAssociative;
 use roc_problem::can::{FloatErrorKind, IntErrorKind, Problem, RuntimeError};
 use roc_region::all::Region;
@ -450,14 +451,106 @@ fn to_bad_ident_expr_report<'b>(
            ])
        }

-        PrivateTagFieldAccess(_row, _col) => alloc.stack(vec![
-            alloc.reflow("I am very confused by this field access:"),
-            alloc.region(surroundings),
-            alloc.concat(vec![
-                alloc.reflow(r"It looks like a record field access on a private tag.")
-            ]),
-        ]),
-        _ => todo!(),
+        PrivateTagFieldAccess(row, col) => {
+            let region =
+                Region::from_rows_cols(surroundings.start_line, surroundings.start_col, row, col);
+            alloc.stack(vec![
+                alloc.reflow("I am very confused by this field access:"),
+                alloc.region_with_subregion(surroundings, region),
+                alloc.concat(vec![
+                    alloc.reflow(r"It looks like a record field access on a private tag.")
+                ]),
+            ])
+        }
+
+        Underscore(row, col) => {
+            let region =
+                Region::from_rows_cols(surroundings.start_line, surroundings.start_col, row, col);
+            alloc.stack(vec![
+                alloc.reflow("Underscores are not allowed in identifier names:"),
+                alloc.region_with_subregion(surroundings, region),
+                alloc.concat(vec![alloc.reflow(
+                    r"I recommend using camelCase, it is the standard in the Roc ecosystem.",
+                )]),
+            ])
+        }
+
+        DoubleDot(row, col) => {
+            let region =
+                Region::from_rows_cols(surroundings.start_line, surroundings.start_col, row, col);
+            alloc.stack(vec![
+                alloc.reflow("I am very confused by these two dots in a row:"),
+                alloc.region_with_subregion(surroundings, region),
+                alloc.concat(vec![
+                    alloc.reflow(r"There always needs to be a name after a dot.")
+                ]),
+            ])
+        }
+
+        StrayAt(row, col) => {
+            let region =
+                Region::from_rows_cols(surroundings.start_line, surroundings.start_col, row, col);
+            alloc.stack(vec![
+                alloc.reflow("I am very confused by this @ symbol"),
+                alloc.region_with_subregion(surroundings, region),
+                alloc.concat(vec![alloc.reflow(r"I expected a private tag.")]),
+            ])
+        }
+
+        BadPrivateTag(row, col) => {
+            use BadIdentNext::*;
+            match what_is_next(alloc.src_lines, row, col) {
+                LowercaseAccess(width) => {
+                    let region = Region::from_rows_cols(row, col, row, col + width);
+                    alloc.stack(vec![
+                        alloc.reflow("I am very confused by this field access:"),
+                        alloc.region_with_subregion(surroundings, region),
+                        alloc.concat(vec![
+                            alloc.reflow(r"It looks like a record field access on a private tag.")
+                        ]),
+                    ])
+                }
+                UppercaseAccess(width) => {
+                    let region = Region::from_rows_cols(row, col, row, col + width);
+                    alloc.stack(vec![
+                        alloc.reflow("I am very confused by this expression:"),
+                        alloc.region_with_subregion(surroundings, region),
+                        alloc.concat(vec![
+                            alloc.reflow(
+                                r"Looks like a private tag is treated like a module name. ",
+                            ),
+                            alloc.reflow(r"Maybe you wanted a qualified name, like "),
+                            alloc.parser_suggestion("Json.Decode.string"),
+                            alloc.text("?"),
+                        ]),
+                    ])
+                }
+                Other(Some(c)) if c.is_lowercase() => {
+                    let region = Region::from_rows_cols(
+                        surroundings.start_line,
+                        surroundings.start_col + 1,
+                        row,
+                        col + 1,
+                    );
+                    alloc.stack(vec![
+                        alloc.reflow("I am trying to parse a private tag here:"),
+                        alloc.region_with_subregion(surroundings, region),
+                        alloc.concat(vec![
+                            alloc.reflow(r"But after the "),
+                            alloc.keyword("@"),
+                            alloc.reflow(r" symbol I found a lowercase letter. "),
+                            alloc.reflow(r"All tag names (global and private)"),
+                            alloc.reflow(r" must start with an uppercase letter, like "),
+                            alloc.parser_suggestion("@UUID"),
+                            alloc.reflow(" or "),
+                            alloc.parser_suggestion("@Secrets"),
+                            alloc.reflow("."),
+                        ]),
+                    ])
+                }
+                other => todo!("{:?}", other),
+            }
+        }
    }
 }

@ -591,6 +684,69 @@ fn to_bad_ident_pattern_report<'b>(
    }
 }

+/// Classification of the source text found at the position a bad-identifier
+/// error points to, as produced by `what_is_next`.
+///
+/// The `u16` payloads are widths in characters: the `.`, the character after it,
+/// and everything that follows up to (not including) whitespace or a `#`.
+#[derive(Debug)]
+enum BadIdentNext<'a> {
+    /// a `.` followed by a lowercase character
+    LowercaseAccess(u16),
+    /// a `.` followed by an uppercase character
+    UppercaseAccess(u16),
+    /// a `.` followed by an ASCII digit
+    NumberAccess(u16),
+    /// the rest of the line starts with this keyword
+    Keyword(&'a str),
+    /// a `.` followed by anything else, or by the end of the line
+    DanglingDot,
+    /// any other next character; `None` when there is no such line or nothing is
+    /// left on the line
+    Other(Option<char>),
+}
+
+/// Classifies the source text starting at (`row`, `col`) so that a bad-identifier
+/// report can describe what the parser ran into.
+///
+/// `row` indexes into `source_lines` and `col` is used as a byte offset into that
+/// line. When `row` is past the last line this returns `BadIdentNext::Other(None)`.
+/// When `col` is past the end of the line, or does not fall on a character
+/// boundary, the remainder of the line is treated as empty instead of panicking
+/// in the middle of building an error report.
+///
+/// Keywords are checked first; otherwise a leading `.` is classified by the
+/// character that follows it, and any other character is returned as
+/// `BadIdentNext::Other(Some(c))`.
+fn what_is_next<'a>(source_lines: &'a [&'a str], row: Row, col: Col) -> BadIdentNext<'a> {
+    let row_index = row as usize;
+    let col_index = col as usize;
+    match source_lines.get(row_index) {
+        None => BadIdentNext::Other(None),
+        Some(line) => {
+            // `str::get` rather than `&line[col_index..]`: direct slicing panics on
+            // an out-of-range or mid-character offset, `get` yields `None` instead
+            let chars = line.get(col_index..).unwrap_or("");
+            let mut it = chars.chars();
+
+            match roc_parse::keyword::KEYWORDS
+                .iter()
+                .find(|keyword| crate::error::parse::starts_with_keyword(chars, keyword))
+            {
+                Some(keyword) => BadIdentNext::Keyword(keyword),
+                None => match it.next() {
+                    None => BadIdentNext::Other(None),
+                    // widths below: 2 accounts for the `.` and the character after
+                    // it, the rest is counted up to whitespace or a `#`
+                    Some('.') => match it.next() {
+                        Some(c) if c.is_lowercase() => {
+                            BadIdentNext::LowercaseAccess(2 + till_whitespace(it) as u16)
+                        }
+                        Some(c) if c.is_uppercase() => {
+                            BadIdentNext::UppercaseAccess(2 + till_whitespace(it) as u16)
+                        }
+                        Some(c) if c.is_ascii_digit() => {
+                            BadIdentNext::NumberAccess(2 + till_whitespace(it) as u16)
+                        }
+                        _ => BadIdentNext::DanglingDot,
+                    },
+                    Some(c) => BadIdentNext::Other(Some(c)),
+                },
+            }
+        }
+    }
+}
+
+/// Counts the characters yielded by `it` before the first ASCII whitespace
+/// character or `#`.
+///
+/// The terminating character is pulled from the iterator but not counted. If the
+/// iterator runs out first, every character it yielded is counted. The result is
+/// a number of `char`s, not bytes.
+fn till_whitespace<I>(mut it: I) -> usize
+where
+    I: Iterator<Item = char>,
+{
+    let mut chomped = 0;
+
+    while let Some(c) = it.next() {
+        // whitespace ends the token; `#` is treated as a terminator as well
+        if c.is_ascii_whitespace() || c == '#' {
+            break;
+        } else {
+            chomped += 1;
+            continue;
+        }
+    }
+
+    chomped
+}
+
 fn pretty_runtime_error<'b>(
    alloc: &'b RocDocAllocator<'b>,
    runtime_error: RuntimeError,
--- a/compiler/reporting/src/error/parse.rs
+++ b/compiler/reporting/src/error/parse.rs
@ -3012,7 +3012,7 @@ fn what_is_next<'a>(source_lines: &'a [&'a str], row: Row, col: Col) -> Next<'a>
    }
 }

-fn starts_with_keyword(rest_of_line: &str, keyword: &str) -> bool {
+pub fn starts_with_keyword(rest_of_line: &str, keyword: &str) -> bool {
    if let Some(stripped) = rest_of_line.strip_prefix(keyword) {
        match stripped.chars().next() {
            None => true,
--- a/compiler/reporting/tests/test_reporting.rs
+++ b/compiler/reporting/tests/test_reporting.rs
@ -4148,14 +4148,13 @@ mod test_reporting {
                r#"
                ── SYNTAX PROBLEM ──────────────────────────────────────────────────────────────
                
-                I am trying to parse a qualified name here:
+                I am very confused by this expression:
                
                1│  @Foo.Bar
-                            ^
+                        ^^^^
                
-                This looks like a qualified tag name to me, but tags cannot be
-                qualified! Maybe you wanted a qualified name, something like
-                Json.Decode.string?
+                Looks like a private tag is treated like a module name. Maybe you
+                wanted a qualified name, like Json.Decode.string?
            "#
            ),
        )
@ -5523,7 +5522,7 @@ mod test_reporting {
                I am very confused by this field access:

                1│  @UUID.bar
-                    ^^^^^^^^^
+                         ^^^^

                It looks like a record field access on a private tag.
            "#