optimize type parser

2025-09-29 06:44:46 +00:00 · 2021-03-11 01:48:05 +01:00 · 2021-03-11 01:48:05 +01:00 · 92cff4c32a
commit 92cff4c32a
parent 30b47b9593
2 changed files with 83 additions and 162 deletions
--- a/compiler/parse/src/ident.rs
+++ b/compiler/parse/src/ident.rs
@ -1,14 +1,7 @@
 use crate::ast::Attempting;
 use crate::keyword;
 use crate::parser::Progress::{self, *};
-use crate::parser::{
+use crate::parser::{BadInputError, Col, EExpr, ParseResult, Parser, Row, State};
    peek_utf8_char, unexpected, BadInputError, Col, EExpr, ParseResult, Parser, Row, State,
    SyntaxError,
 };
 use bumpalo::collections::string::String;
 use bumpalo::collections::vec::Vec;
 use bumpalo::Bump;
 use roc_region::all::Region;
 /// The parser accepts all of these in any position where any one of them could
 /// appear. This way, canonicalization can give more helpful error messages like
@ -66,21 +59,20 @@ impl<'a> Ident<'a> {
 /// * A record field, e.g. "email" in `.email` or in `email:`
 /// * A named pattern match, e.g. "foo" in `foo =` or `foo ->` or `\foo ->`
 pub fn lowercase_ident<'a>() -> impl Parser<'a, &'a str, ()> {
-    debug!(
+    move |_, state: State<'a>| match chomp_lowercase_part(state.bytes) {
-        move |_, mut state: State<'a>| match chomp_lowercase_part(state.bytes) {
+        Err(progress) => Err((progress, (), state)),
-            Err(progress) => Err((progress, (), state)),
+        Ok(ident) => {
-            Ok(ident) => {
+            if crate::keyword::KEYWORDS.iter().any(|kw| &ident == kw) {
-                if crate::keyword::KEYWORDS.iter().any(|kw| &ident == kw) {
+                Err((MadeProgress, (), state))
-                    Err((MadeProgress, (), state))
+            } else {
-                } else {
+                let width = ident.len();
-                    let width = ident.len();
+                match state.advance_without_indenting_ee(width, |_, _| ()) {
-                    state.column += width as u16;
+                    Ok(state) => Ok((MadeProgress, ident, state)),
-                    state.bytes = &state.bytes[width..];
+                    Err(bad) => Err(bad),
                    Ok((MadeProgress, ident, state))
                }
            }
        }
-    )
+    }
 }
 /// This could be:
@ -89,51 +81,35 @@ pub fn lowercase_ident<'a>() -> impl Parser<'a, &'a str, ()> {
 /// * A type name
 /// * A global tag
 pub fn uppercase_ident<'a>() -> impl Parser<'a, &'a str, ()> {
-    move |_, mut state: State<'a>| match chomp_uppercase_part(state.bytes) {
+    move |_, state: State<'a>| match chomp_uppercase_part(state.bytes) {
        Err(progress) => Err((progress, (), state)),
        Ok(ident) => {
            let width = ident.len();
-            state.column += width as u16;
+            match state.advance_without_indenting_ee(width, |_, _| ()) {
-            state.bytes = &state.bytes[width..];
+                Ok(state) => Ok((MadeProgress, ident, state)),
-            Ok((MadeProgress, ident, state))
+                Err(bad) => Err(bad),
            }
        }
    }
 }
 pub fn unqualified_ident<'a>() -> impl Parser<'a, &'a str, ()> {
-    move |_, mut state: State<'a>| match chomp_part(|c| c.is_alphabetic(), state.bytes) {
+    move |_, state: State<'a>| match chomp_part(|c| c.is_alphabetic(), state.bytes) {
        Err(progress) => Err((progress, (), state)),
        Ok(ident) => {
            if crate::keyword::KEYWORDS.iter().any(|kw| &ident == kw) {
                Err((MadeProgress, (), state))
            } else {
                let width = ident.len();
-                state.column += width as u16;
+                match state.advance_without_indenting_ee(width, |_, _| ()) {
-                state.bytes = &state.bytes[width..];
+                    Ok(state) => Ok((MadeProgress, ident, state)),
-                Ok((MadeProgress, ident, state))
+                    Err(bad) => Err(bad),
                }
            }
        }
    }
 }
 /// Joins `module_parts` into a single dot-separated string (e.g. `["Foo", "Bar"]`
 /// becomes `"Foo.Bar"`) that is allocated in `arena`.
 ///
 /// An empty `module_parts` slice produces an empty string.
 pub fn join_module_parts<'a>(arena: &'a Bump, module_parts: &[&str]) -> &'a str {
    // Module parts tend to be 3+ characters, so reserve about that much up front.
    let mut joined = String::with_capacity_in(module_parts.len() * 3, arena);
    for (position, part) in module_parts.iter().enumerate() {
        // Every part except the first one is preceded by a `.` separator.
        if position > 0 {
            joined.push('.');
        }
        joined.push_str(part);
    }
    joined.into_bump_str()
 }
 macro_rules! advance_state {
    ($state:expr, $n:expr) => {
        $state.advance_without_indenting_ee($n, |r, c| {
@ -185,30 +161,26 @@ fn malformed_identifier<'a>(
    _arena: &'a Bump,
    mut state: State<'a>,
 ) -> ParseResult<'a, Ident<'a>, EExpr<'a>> {
    use encode_unicode::CharExt;
    // skip forward to the next non-identifier character
-    while !state.bytes.is_empty() {
+    let mut chomped = 0;
-        match peek_utf8_char(&state) {
+    while let Ok((ch, width)) = char::from_utf8_slice_start(&state.bytes[chomped..]) {
-            Ok((ch, bytes_parsed)) => {
+        // We can't use ch.is_alphanumeric() here because that passes for
-                // We can't use ch.is_alphanumeric() here because that passes for
+        // things that are "numeric" but not ASCII digits, like `¾`
-                // things that are "numeric" but not ASCII digits, like `¾`
+        if ch == '.' || ch == '_' || ch.is_alphabetic() || ch.is_ascii_digit() {
-                if ch == '.' || ch == '_' || ch.is_alphabetic() || ch.is_ascii_digit() {
+            chomped += width;
-                    state = state.advance_without_indenting_ee(bytes_parsed, |r, c| {
+            continue;
-                        EExpr::Space(crate::parser::BadInputError::LineTooLong, r, c)
+        } else {
-                    })?;
+            break;
                    continue;
                } else {
                    break;
                }
            }
            Err(_reason) => {
                break;
            }
        }
    }
-    let parsed = &initial_bytes[..(initial_bytes.len() - state.bytes.len())];
+    let delta = initial_bytes.len() - state.bytes.len();
    let parsed_str = unsafe { std::str::from_utf8_unchecked(&initial_bytes[..chomped + delta]) };
-    let parsed_str = unsafe { std::str::from_utf8_unchecked(parsed) };
+    state = state.advance_without_indenting_ee(chomped, |r, c| {
        EExpr::Space(crate::parser::BadInputError::LineTooLong, r, c)
    })?;
    Ok((MadeProgress, Ident::Malformed(parsed_str, problem), state))
 }
@ -240,7 +212,6 @@ fn chomp_part<F>(leading_is_good: F, buffer: &[u8]) -> Result<&str, Progress>
 where
    F: Fn(char) -> bool,
 {
    // assumes the leading `.` has been chomped already
    use encode_unicode::CharExt;
    let mut chomped = 0;
@ -455,6 +426,47 @@ fn chomp_module_chain<'a>(buffer: &'a [u8]) -> Result<u16, Progress> {
    }
 }
 /// Builds a parser for a concrete type name, producing a
 /// `(module_name, type_name)` pair.
 ///
 /// The bytes are recognized by `chomp_concrete_type`. When it fails, the
 /// progress it reported is returned together with the untouched input state.
 /// When it succeeds, the state is advanced by the chomped width; a failure to
 /// advance is passed through to the caller unchanged.
 pub fn concrete_type<'a>() -> impl Parser<'a, (&'a str, &'a str), ()> {
    move |_, state: State<'a>| {
        let (module_name, type_name, width) = match chomp_concrete_type(state.bytes) {
            Ok(chomped) => chomped,
            Err(progress) => return Err((progress, (), state)),
        };
        state
            .advance_without_indenting_ee(width, |_, _| ())
            .map(|advanced| (MadeProgress, (module_name, type_name), advanced))
    }
 }
 /// Chomps a type name like `Result` or `Result.Result` from the start of `buffer`.
 ///
 /// On success returns `(module_name, type_name, width)`, where `width` is the
 /// number of bytes that belong to the type name. `module_name` is everything
 /// before the last `.` and is empty for an unqualified name.
 ///
 /// Fails with whatever progress `chomp_uppercase_part` reports for the first
 /// part, or with `MadeProgress` when the dotted remainder cannot be chomped.
 fn chomp_concrete_type<'a>(buffer: &'a [u8]) -> Result<(&'a str, &'a str, usize), Progress> {
    let first = crate::ident::chomp_uppercase_part(buffer)?;
    // Without a `.` directly after the first part, the name is unqualified.
    if buffer.get(first.len()) != Some(&b'.') {
        return Ok(("", first, first.len()));
    }
    // The first part was already accepted, so a bad remainder counts as progress made.
    let rest = crate::ident::chomp_module_chain(&buffer[first.len()..])
        .map_err(|_| MadeProgress)?;
    let width = first.len() + usize::from(rest);
    let chomped = &buffer[..width];
    // The last `.` separates the module path from the type name itself.
    match chomped.iter().rposition(|byte| *byte == b'.') {
        None => Ok(("", first, first.len())),
        Some(dot) => {
            // SAFETY (assumed, TODO confirm): `chomped` is presumably valid UTF-8
            // because the chomp helpers accepted it; `.` is ASCII, so cutting
            // right before it would then land on a character boundary.
            let module_name = unsafe { std::str::from_utf8_unchecked(&chomped[..dot]) };
            // SAFETY (assumed, TODO confirm): same reasoning as above, cutting
            // right after the ASCII `.`.
            let type_name = unsafe { std::str::from_utf8_unchecked(&chomped[dot + 1..]) };
            Ok((module_name, type_name, width))
        }
    }
 }
 fn chomp_access_chain<'a>(buffer: &'a [u8], parts: &mut Vec<'a, &'a str>) -> Result<u16, u16> {
    let mut chomped = 0;
--- a/compiler/parse/src/type_annotation.rs
+++ b/compiler/parse/src/type_annotation.rs
@ -1,6 +1,5 @@
 use crate::ast::{AssignedField, Tag, TypeAnnotation};
 use crate::blankspace::{space0_around_ee, space0_before_e, space0_e};
 use crate::ident::join_module_parts;
 use crate::keyword;
 use crate::parser::{
    allocated, backtrackable, not_e, optional, peek_utf8_char_e, specialize, specialize_ref, word1,
@ -517,102 +516,12 @@ fn expression<'a>(min_indent: u16) -> impl Parser<'a, Located<TypeAnnotation<'a>
 fn parse_concrete_type<'a>(
    arena: &'a Bump,
-    mut state: State<'a>,
+    state: State<'a>,
 ) -> ParseResult<'a, TypeAnnotation<'a>, TApply> {
-    let mut part_buf = String::new_in(arena); // The current "part" (parts are dot-separated.)
+    let (_, (module_name, type_name), state) =
-    let mut parts: Vec<&'a str> = Vec::new_in(arena);
+        specialize(|_, r, c| TApply::End(r, c), crate::ident::concrete_type())
-
+            .parse(arena, state)?;
-    // Qualified types must start with a capitalized letter.
+    let answer = TypeAnnotation::Apply(module_name, type_name, &[]);
    match peek_utf8_char_e(&state, TApply::StartNotUppercase, TApply::Space) {
        Ok((first_letter, bytes_parsed)) => {
            if first_letter.is_alphabetic() && first_letter.is_uppercase() {
                part_buf.push(first_letter);
            } else {
                let problem = TApply::StartNotUppercase(state.line, state.column + 1);
                return Err((NoProgress, problem, state));
            }
            state = state.advance_without_indenting_e(bytes_parsed, TApply::Space)?;
        }
        Err(reason) => return Err((NoProgress, reason, state)),
    }
    while !state.bytes.is_empty() {
        match peek_utf8_char_e(&state, TApply::End, TApply::Space) {
            Ok((ch, bytes_parsed)) => {
                // After the first character, only these are allowed:
                //
                // * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
                // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
                // * A dot ('.')
                if ch.is_alphabetic() {
                    if part_buf.is_empty() && !ch.is_uppercase() {
                        // Each part must begin with a capital letter.
                        return Err((
                            MadeProgress,
                            TApply::StartNotUppercase(state.line, state.column),
                            state,
                        ));
                    }
                    part_buf.push(ch);
                } else if ch.is_ascii_digit() {
                    // Parts may not start with numbers!
                    if part_buf.is_empty() {
                        return Err((
                            MadeProgress,
                            TApply::StartIsNumber(state.line, state.column),
                            state,
                        ));
                    }
                    part_buf.push(ch);
                } else if ch == '.' {
                    // Having two consecutive dots is an error.
                    if part_buf.is_empty() {
                        return Err((
                            MadeProgress,
                            TApply::DoubleDot(state.line, state.column),
                            state,
                        ));
                    }
                    parts.push(part_buf.into_bump_str());
                    // Now that we've recorded the contents of the current buffer, reset it.
                    part_buf = String::new_in(arena);
                } else {
                    // This must be the end of the type. We're done!
                    break;
                }
                state = state.advance_without_indenting_e(bytes_parsed, TApply::Space)?;
            }
            Err(reason) => {
                return Err((MadeProgress, reason, state));
            }
        }
    }
    if part_buf.is_empty() {
        // We probably had a trailing dot, e.g. `Foo.bar.` - this is malformed!
        //
        // This condition might also occur if we encounter a malformed accessor like `.|`
        //
        // If we made it this far and don't have a next_char, then necessarily
        // we have consumed a '.' char previously.
        return Err((
            MadeProgress,
            TApply::TrailingDot(state.line, state.column),
            state,
        ));
    }
    let answer = TypeAnnotation::Apply(
        join_module_parts(arena, parts.into_bump_slice()),
        part_buf.into_bump_str(),
        &[],
    );
    Ok((MadeProgress, answer, state))
 }