diff --git a/compiler/parse/src/ident.rs b/compiler/parse/src/ident.rs index 3413db0cac..dc633c953a 100644 --- a/compiler/parse/src/ident.rs +++ b/compiler/parse/src/ident.rs @@ -1,14 +1,7 @@ -use crate::ast::Attempting; -use crate::keyword; use crate::parser::Progress::{self, *}; -use crate::parser::{ - peek_utf8_char, unexpected, BadInputError, Col, EExpr, ParseResult, Parser, Row, State, - SyntaxError, -}; -use bumpalo::collections::string::String; +use crate::parser::{BadInputError, Col, EExpr, ParseResult, Parser, Row, State}; use bumpalo::collections::vec::Vec; use bumpalo::Bump; -use roc_region::all::Region; /// The parser accepts all of these in any position where any one of them could /// appear. This way, canonicalization can give more helpful error messages like @@ -66,21 +59,20 @@ impl<'a> Ident<'a> { /// * A record field, e.g. "email" in `.email` or in `email:` /// * A named pattern match, e.g. "foo" in `foo =` or `foo ->` or `\foo ->` pub fn lowercase_ident<'a>() -> impl Parser<'a, &'a str, ()> { - debug!( - move |_, mut state: State<'a>| match chomp_lowercase_part(state.bytes) { - Err(progress) => Err((progress, (), state)), - Ok(ident) => { - if crate::keyword::KEYWORDS.iter().any(|kw| &ident == kw) { - Err((MadeProgress, (), state)) - } else { - let width = ident.len(); - state.column += width as u16; - state.bytes = &state.bytes[width..]; - Ok((MadeProgress, ident, state)) + move |_, state: State<'a>| match chomp_lowercase_part(state.bytes) { + Err(progress) => Err((progress, (), state)), + Ok(ident) => { + if crate::keyword::KEYWORDS.iter().any(|kw| &ident == kw) { + Err((MadeProgress, (), state)) + } else { + let width = ident.len(); + match state.advance_without_indenting_ee(width, |_, _| ()) { + Ok(state) => Ok((MadeProgress, ident, state)), + Err(bad) => Err(bad), } } } - ) + } } /// This could be: @@ -89,51 +81,35 @@ pub fn lowercase_ident<'a>() -> impl Parser<'a, &'a str, ()> { /// * A type name /// * A global tag pub fn uppercase_ident<'a>() -> impl Parser<'a, &'a str, ()> { - move |_, mut state: State<'a>| match chomp_uppercase_part(state.bytes) { + move |_, state: State<'a>| match chomp_uppercase_part(state.bytes) { Err(progress) => Err((progress, (), state)), Ok(ident) => { let width = ident.len(); - state.column += width as u16; - state.bytes = &state.bytes[width..]; - Ok((MadeProgress, ident, state)) + match state.advance_without_indenting_ee(width, |_, _| ()) { + Ok(state) => Ok((MadeProgress, ident, state)), + Err(bad) => Err(bad), + } } } } pub fn unqualified_ident<'a>() -> impl Parser<'a, &'a str, ()> { - move |_, mut state: State<'a>| match chomp_part(|c| c.is_alphabetic(), state.bytes) { + move |_, state: State<'a>| match chomp_part(|c| c.is_alphabetic(), state.bytes) { Err(progress) => Err((progress, (), state)), Ok(ident) => { if crate::keyword::KEYWORDS.iter().any(|kw| &ident == kw) { Err((MadeProgress, (), state)) } else { let width = ident.len(); - state.column += width as u16; - state.bytes = &state.bytes[width..]; - Ok((MadeProgress, ident, state)) + match state.advance_without_indenting_ee(width, |_, _| ()) { + Ok(state) => Ok((MadeProgress, ident, state)), + Err(bad) => Err(bad), + } } } } } -pub fn join_module_parts<'a>(arena: &'a Bump, module_parts: &[&str]) -> &'a str { - let capacity = module_parts.len() * 3; // Module parts tend to be 3+ characters. - let mut buf = String::with_capacity_in(capacity, arena); - let mut any_parts_added = false; - - for part in module_parts { - if any_parts_added { - buf.push('.'); - } else { - any_parts_added = true; - } - - buf.push_str(part); - } - - buf.into_bump_str() -} - macro_rules! advance_state { ($state:expr, $n:expr) => { $state.advance_without_indenting_ee($n, |r, c| { @@ -185,30 +161,26 @@ fn malformed_identifier<'a>( _arena: &'a Bump, mut state: State<'a>, ) -> ParseResult<'a, Ident<'a>, EExpr<'a>> { + use encode_unicode::CharExt; // skip forward to the next non-identifier character - while !state.bytes.is_empty() { - match peek_utf8_char(&state) { - Ok((ch, bytes_parsed)) => { - // We can't use ch.is_alphanumeric() here because that passes for - // things that are "numeric" but not ASCII digits, like `¾` - if ch == '.' || ch == '_' || ch.is_alphabetic() || ch.is_ascii_digit() { - state = state.advance_without_indenting_ee(bytes_parsed, |r, c| { - EExpr::Space(crate::parser::BadInputError::LineTooLong, r, c) - })?; - continue; - } else { - break; - } - } - Err(_reason) => { - break; - } + let mut chomped = 0; + while let Ok((ch, width)) = char::from_utf8_slice_start(&state.bytes[chomped..]) { + // We can't use ch.is_alphanumeric() here because that passes for + // things that are "numeric" but not ASCII digits, like `¾` + if ch == '.' || ch == '_' || ch.is_alphabetic() || ch.is_ascii_digit() { + chomped += width; + continue; + } else { + break; } } - let parsed = &initial_bytes[..(initial_bytes.len() - state.bytes.len())]; + let delta = initial_bytes.len() - state.bytes.len(); + let parsed_str = unsafe { std::str::from_utf8_unchecked(&initial_bytes[..chomped + delta]) }; - let parsed_str = unsafe { std::str::from_utf8_unchecked(parsed) }; + state = state.advance_without_indenting_ee(chomped, |r, c| { + EExpr::Space(crate::parser::BadInputError::LineTooLong, r, c) + })?; Ok((MadeProgress, Ident::Malformed(parsed_str, problem), state)) } @@ -240,7 +212,6 @@ fn chomp_part(leading_is_good: F, buffer: &[u8]) -> Result<&str, Progress> where F: Fn(char) -> bool, { - // assumes the leading `.` has been chomped already use encode_unicode::CharExt; let mut chomped = 0; @@ -455,6 +426,47 @@ fn chomp_module_chain<'a>(buffer: &'a [u8]) -> Result { } } +pub fn concrete_type<'a>() -> impl Parser<'a, (&'a str, &'a str), ()> { + move |_, state: State<'a>| match chomp_concrete_type(state.bytes) { + Err(progress) => Err((progress, (), state)), + Ok((module_name, type_name, width)) => { + match state.advance_without_indenting_ee(width, |_, _| ()) { + Ok(state) => Ok((MadeProgress, (module_name, type_name), state)), + Err(bad) => Err(bad), + } + } + } +} + +// parse a type name like `Result` or `Result.Result` +fn chomp_concrete_type<'a>(buffer: &'a [u8]) -> Result<(&'a str, &'a str, usize), Progress> { + let first = crate::ident::chomp_uppercase_part(buffer)?; + + if let Some(b'.') = buffer.get(first.len()) { + match crate::ident::chomp_module_chain(&buffer[first.len()..]) { + Err(_) => Err(MadeProgress), + Ok(rest) => { + let width = first.len() + rest as usize; + let slice = &buffer[..width]; + + match slice.iter().rev().position(|c| *c == b'.') { + None => Ok(("", first, first.len())), + Some(rev_index) => { + let index = slice.len() - rev_index; + let module_name = + unsafe { std::str::from_utf8_unchecked(&slice[..index - 1]) }; + let type_name = unsafe { std::str::from_utf8_unchecked(&slice[index..]) }; + + Ok((module_name, type_name, width)) + } + } + } + } + } else { + Ok(("", first, first.len())) + } +} + fn chomp_access_chain<'a>(buffer: &'a [u8], parts: &mut Vec<'a, &'a str>) -> Result { let mut chomped = 0; diff --git a/compiler/parse/src/type_annotation.rs b/compiler/parse/src/type_annotation.rs index 5f7b6fb5f1..116e345f0e 100644 --- a/compiler/parse/src/type_annotation.rs +++ b/compiler/parse/src/type_annotation.rs @@ -1,6 +1,5 @@ use crate::ast::{AssignedField, Tag, TypeAnnotation}; use crate::blankspace::{space0_around_ee, space0_before_e, space0_e}; -use crate::ident::join_module_parts; use crate::keyword; use crate::parser::{ allocated, backtrackable, not_e, optional, peek_utf8_char_e, specialize, specialize_ref, word1, @@ -517,102 +516,12 @@ fn expression<'a>(min_indent: u16) -> impl Parser<'a, Located fn parse_concrete_type<'a>( arena: &'a Bump, - mut state: State<'a>, + state: State<'a>, ) -> ParseResult<'a, TypeAnnotation<'a>, TApply> { - let mut part_buf = String::new_in(arena); // The current "part" (parts are dot-separated.) - let mut parts: Vec<&'a str> = Vec::new_in(arena); - - // Qualified types must start with a capitalized letter. - match peek_utf8_char_e(&state, TApply::StartNotUppercase, TApply::Space) { - Ok((first_letter, bytes_parsed)) => { - if first_letter.is_alphabetic() && first_letter.is_uppercase() { - part_buf.push(first_letter); - } else { - let problem = TApply::StartNotUppercase(state.line, state.column + 1); - return Err((NoProgress, problem, state)); - } - - state = state.advance_without_indenting_e(bytes_parsed, TApply::Space)?; - } - Err(reason) => return Err((NoProgress, reason, state)), - } - - while !state.bytes.is_empty() { - match peek_utf8_char_e(&state, TApply::End, TApply::Space) { - Ok((ch, bytes_parsed)) => { - // After the first character, only these are allowed: - // - // * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers - // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric() - // * A dot ('.') - if ch.is_alphabetic() { - if part_buf.is_empty() && !ch.is_uppercase() { - // Each part must begin with a capital letter. - return Err(( - MadeProgress, - TApply::StartNotUppercase(state.line, state.column), - state, - )); - } - - part_buf.push(ch); - } else if ch.is_ascii_digit() { - // Parts may not start with numbers! - if part_buf.is_empty() { - return Err(( - MadeProgress, - TApply::StartIsNumber(state.line, state.column), - state, - )); - } - - part_buf.push(ch); - } else if ch == '.' { - // Having two consecutive dots is an error. - if part_buf.is_empty() { - return Err(( - MadeProgress, - TApply::DoubleDot(state.line, state.column), - state, - )); - } - - parts.push(part_buf.into_bump_str()); - - // Now that we've recorded the contents of the current buffer, reset it. - part_buf = String::new_in(arena); - } else { - // This must be the end of the type. We're done! - break; - } - - state = state.advance_without_indenting_e(bytes_parsed, TApply::Space)?; - } - Err(reason) => { - return Err((MadeProgress, reason, state)); - } - } - } - - if part_buf.is_empty() { - // We probably had a trailing dot, e.g. `Foo.bar.` - this is malformed! - // - // This condition might also occur if we encounter a malformed accessor like `.|` - // - // If we made it this far and don't have a next_char, then necessarily - // we have consumed a '.' char previously. - return Err(( - MadeProgress, - TApply::TrailingDot(state.line, state.column), - state, - )); - } - - let answer = TypeAnnotation::Apply( - join_module_parts(arena, parts.into_bump_slice()), - part_buf.into_bump_str(), - &[], - ); + let (_, (module_name, type_name), state) = + specialize(|_, r, c| TApply::End(r, c), crate::ident::concrete_type()) + .parse(arena, state)?; + let answer = TypeAnnotation::Apply(module_name, type_name, &[]); Ok((MadeProgress, answer, state)) }