mirror of
https://github.com/roc-lang/roc.git
synced 2025-09-29 06:44:46 +00:00
optimize type parser
This commit is contained in:
parent
30b47b9593
commit
92cff4c32a
2 changed files with 83 additions and 162 deletions
|
@ -1,14 +1,7 @@
|
||||||
use crate::ast::Attempting;
|
|
||||||
use crate::keyword;
|
|
||||||
use crate::parser::Progress::{self, *};
|
use crate::parser::Progress::{self, *};
|
||||||
use crate::parser::{
|
use crate::parser::{BadInputError, Col, EExpr, ParseResult, Parser, Row, State};
|
||||||
peek_utf8_char, unexpected, BadInputError, Col, EExpr, ParseResult, Parser, Row, State,
|
|
||||||
SyntaxError,
|
|
||||||
};
|
|
||||||
use bumpalo::collections::string::String;
|
|
||||||
use bumpalo::collections::vec::Vec;
|
use bumpalo::collections::vec::Vec;
|
||||||
use bumpalo::Bump;
|
use bumpalo::Bump;
|
||||||
use roc_region::all::Region;
|
|
||||||
|
|
||||||
/// The parser accepts all of these in any position where any one of them could
|
/// The parser accepts all of these in any position where any one of them could
|
||||||
/// appear. This way, canonicalization can give more helpful error messages like
|
/// appear. This way, canonicalization can give more helpful error messages like
|
||||||
|
@ -66,21 +59,20 @@ impl<'a> Ident<'a> {
|
||||||
/// * A record field, e.g. "email" in `.email` or in `email:`
|
/// * A record field, e.g. "email" in `.email` or in `email:`
|
||||||
/// * A named pattern match, e.g. "foo" in `foo =` or `foo ->` or `\foo ->`
|
/// * A named pattern match, e.g. "foo" in `foo =` or `foo ->` or `\foo ->`
|
||||||
pub fn lowercase_ident<'a>() -> impl Parser<'a, &'a str, ()> {
|
pub fn lowercase_ident<'a>() -> impl Parser<'a, &'a str, ()> {
|
||||||
debug!(
|
move |_, state: State<'a>| match chomp_lowercase_part(state.bytes) {
|
||||||
move |_, mut state: State<'a>| match chomp_lowercase_part(state.bytes) {
|
Err(progress) => Err((progress, (), state)),
|
||||||
Err(progress) => Err((progress, (), state)),
|
Ok(ident) => {
|
||||||
Ok(ident) => {
|
if crate::keyword::KEYWORDS.iter().any(|kw| &ident == kw) {
|
||||||
if crate::keyword::KEYWORDS.iter().any(|kw| &ident == kw) {
|
Err((MadeProgress, (), state))
|
||||||
Err((MadeProgress, (), state))
|
} else {
|
||||||
} else {
|
let width = ident.len();
|
||||||
let width = ident.len();
|
match state.advance_without_indenting_ee(width, |_, _| ()) {
|
||||||
state.column += width as u16;
|
Ok(state) => Ok((MadeProgress, ident, state)),
|
||||||
state.bytes = &state.bytes[width..];
|
Err(bad) => Err(bad),
|
||||||
Ok((MadeProgress, ident, state))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
)
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This could be:
|
/// This could be:
|
||||||
|
@ -89,51 +81,35 @@ pub fn lowercase_ident<'a>() -> impl Parser<'a, &'a str, ()> {
|
||||||
/// * A type name
|
/// * A type name
|
||||||
/// * A global tag
|
/// * A global tag
|
||||||
pub fn uppercase_ident<'a>() -> impl Parser<'a, &'a str, ()> {
|
pub fn uppercase_ident<'a>() -> impl Parser<'a, &'a str, ()> {
|
||||||
move |_, mut state: State<'a>| match chomp_uppercase_part(state.bytes) {
|
move |_, state: State<'a>| match chomp_uppercase_part(state.bytes) {
|
||||||
Err(progress) => Err((progress, (), state)),
|
Err(progress) => Err((progress, (), state)),
|
||||||
Ok(ident) => {
|
Ok(ident) => {
|
||||||
let width = ident.len();
|
let width = ident.len();
|
||||||
state.column += width as u16;
|
match state.advance_without_indenting_ee(width, |_, _| ()) {
|
||||||
state.bytes = &state.bytes[width..];
|
Ok(state) => Ok((MadeProgress, ident, state)),
|
||||||
Ok((MadeProgress, ident, state))
|
Err(bad) => Err(bad),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn unqualified_ident<'a>() -> impl Parser<'a, &'a str, ()> {
|
pub fn unqualified_ident<'a>() -> impl Parser<'a, &'a str, ()> {
|
||||||
move |_, mut state: State<'a>| match chomp_part(|c| c.is_alphabetic(), state.bytes) {
|
move |_, state: State<'a>| match chomp_part(|c| c.is_alphabetic(), state.bytes) {
|
||||||
Err(progress) => Err((progress, (), state)),
|
Err(progress) => Err((progress, (), state)),
|
||||||
Ok(ident) => {
|
Ok(ident) => {
|
||||||
if crate::keyword::KEYWORDS.iter().any(|kw| &ident == kw) {
|
if crate::keyword::KEYWORDS.iter().any(|kw| &ident == kw) {
|
||||||
Err((MadeProgress, (), state))
|
Err((MadeProgress, (), state))
|
||||||
} else {
|
} else {
|
||||||
let width = ident.len();
|
let width = ident.len();
|
||||||
state.column += width as u16;
|
match state.advance_without_indenting_ee(width, |_, _| ()) {
|
||||||
state.bytes = &state.bytes[width..];
|
Ok(state) => Ok((MadeProgress, ident, state)),
|
||||||
Ok((MadeProgress, ident, state))
|
Err(bad) => Err(bad),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn join_module_parts<'a>(arena: &'a Bump, module_parts: &[&str]) -> &'a str {
|
|
||||||
let capacity = module_parts.len() * 3; // Module parts tend to be 3+ characters.
|
|
||||||
let mut buf = String::with_capacity_in(capacity, arena);
|
|
||||||
let mut any_parts_added = false;
|
|
||||||
|
|
||||||
for part in module_parts {
|
|
||||||
if any_parts_added {
|
|
||||||
buf.push('.');
|
|
||||||
} else {
|
|
||||||
any_parts_added = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
buf.push_str(part);
|
|
||||||
}
|
|
||||||
|
|
||||||
buf.into_bump_str()
|
|
||||||
}
|
|
||||||
|
|
||||||
macro_rules! advance_state {
|
macro_rules! advance_state {
|
||||||
($state:expr, $n:expr) => {
|
($state:expr, $n:expr) => {
|
||||||
$state.advance_without_indenting_ee($n, |r, c| {
|
$state.advance_without_indenting_ee($n, |r, c| {
|
||||||
|
@ -185,30 +161,26 @@ fn malformed_identifier<'a>(
|
||||||
_arena: &'a Bump,
|
_arena: &'a Bump,
|
||||||
mut state: State<'a>,
|
mut state: State<'a>,
|
||||||
) -> ParseResult<'a, Ident<'a>, EExpr<'a>> {
|
) -> ParseResult<'a, Ident<'a>, EExpr<'a>> {
|
||||||
|
use encode_unicode::CharExt;
|
||||||
// skip forward to the next non-identifier character
|
// skip forward to the next non-identifier character
|
||||||
while !state.bytes.is_empty() {
|
let mut chomped = 0;
|
||||||
match peek_utf8_char(&state) {
|
while let Ok((ch, width)) = char::from_utf8_slice_start(&state.bytes[chomped..]) {
|
||||||
Ok((ch, bytes_parsed)) => {
|
// We can't use ch.is_alphanumeric() here because that passes for
|
||||||
// We can't use ch.is_alphanumeric() here because that passes for
|
// things that are "numeric" but not ASCII digits, like `¾`
|
||||||
// things that are "numeric" but not ASCII digits, like `¾`
|
if ch == '.' || ch == '_' || ch.is_alphabetic() || ch.is_ascii_digit() {
|
||||||
if ch == '.' || ch == '_' || ch.is_alphabetic() || ch.is_ascii_digit() {
|
chomped += width;
|
||||||
state = state.advance_without_indenting_ee(bytes_parsed, |r, c| {
|
continue;
|
||||||
EExpr::Space(crate::parser::BadInputError::LineTooLong, r, c)
|
} else {
|
||||||
})?;
|
break;
|
||||||
continue;
|
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(_reason) => {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let parsed = &initial_bytes[..(initial_bytes.len() - state.bytes.len())];
|
let delta = initial_bytes.len() - state.bytes.len();
|
||||||
|
let parsed_str = unsafe { std::str::from_utf8_unchecked(&initial_bytes[..chomped + delta]) };
|
||||||
|
|
||||||
let parsed_str = unsafe { std::str::from_utf8_unchecked(parsed) };
|
state = state.advance_without_indenting_ee(chomped, |r, c| {
|
||||||
|
EExpr::Space(crate::parser::BadInputError::LineTooLong, r, c)
|
||||||
|
})?;
|
||||||
|
|
||||||
Ok((MadeProgress, Ident::Malformed(parsed_str, problem), state))
|
Ok((MadeProgress, Ident::Malformed(parsed_str, problem), state))
|
||||||
}
|
}
|
||||||
|
@ -240,7 +212,6 @@ fn chomp_part<F>(leading_is_good: F, buffer: &[u8]) -> Result<&str, Progress>
|
||||||
where
|
where
|
||||||
F: Fn(char) -> bool,
|
F: Fn(char) -> bool,
|
||||||
{
|
{
|
||||||
// assumes the leading `.` has been chomped already
|
|
||||||
use encode_unicode::CharExt;
|
use encode_unicode::CharExt;
|
||||||
|
|
||||||
let mut chomped = 0;
|
let mut chomped = 0;
|
||||||
|
@ -455,6 +426,47 @@ fn chomp_module_chain<'a>(buffer: &'a [u8]) -> Result<u16, Progress> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Builds a parser for a concrete type name, optionally qualified by a module path.
///
/// The parser delegates the scanning to `chomp_concrete_type` on the remaining
/// input bytes and yields `(module_name, type_name)` on success.
///
/// * If `chomp_concrete_type` fails, the `Progress` it reported is returned
///   together with a unit error and the untouched `state`.
/// * If it succeeds, the state is advanced by the reported `width` (in bytes)
///   via `advance_without_indenting_ee`; a failure while advancing is passed
///   through unchanged, with `()` as the error payload.
pub fn concrete_type<'a>() -> impl Parser<'a, (&'a str, &'a str), ()> {
|
||||||
|
move |_, state: State<'a>| match chomp_concrete_type(state.bytes) {
|
||||||
|
Err(progress) => Err((progress, (), state)),
|
||||||
|
Ok((module_name, type_name, width)) => {
|
||||||
|
// Consume exactly the bytes that were chomped; the error mapper discards
// the row/column because this parser's error type is `()`.
match state.advance_without_indenting_ee(width, |_, _| ()) {
|
||||||
|
Ok(state) => Ok((MadeProgress, (module_name, type_name), state)),
|
||||||
|
Err(bad) => Err(bad),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse a type name like `Result` or `Result.Result` from the start of `buffer`.
//
// Returns `(module_name, type_name, width)`, where `width` is the number of
// bytes chomped. `module_name` is the empty string when the leading uppercase
// part is not followed by a `.`.
//
// Errors: a failure of `chomp_uppercase_part` is propagated as-is; any failure
// of `chomp_module_chain` after the first part is reported as `MadeProgress`.
|
||||||
|
fn chomp_concrete_type<'a>(buffer: &'a [u8]) -> Result<(&'a str, &'a str, usize), Progress> {
|
||||||
|
let first = crate::ident::chomp_uppercase_part(buffer)?;
|
||||||
|
|
||||||
|
// Only a `.` directly after the first part makes this a qualified name.
if let Some(b'.') = buffer.get(first.len()) {
|
||||||
|
// The chain is chomped starting at the `.` itself, not after it.
match crate::ident::chomp_module_chain(&buffer[first.len()..]) {
|
||||||
|
Err(_) => Err(MadeProgress),
|
||||||
|
Ok(rest) => {
|
||||||
|
let width = first.len() + rest as usize;
|
||||||
|
let slice = &buffer[..width];
|
||||||
|
|
||||||
|
// Search from the end: the last `.` splits module path from type name.
match slice.iter().rev().position(|c| *c == b'.') {
|
||||||
|
// NOTE(review): presumably unreachable, since the chain was chomped
// starting at a `.`; confirm that `chomp_module_chain` never returns 0.
None => Ok(("", first, first.len())),
|
||||||
|
Some(rev_index) => {
|
||||||
|
// `index` is one past the last `.`; `slice[index - 1]` is that `.`.
let index = slice.len() - rev_index;
|
||||||
|
let module_name =
|
||||||
|
// SAFETY (assumed, TODO confirm): relies on `slice` being valid UTF-8,
// i.e. on `chomp_module_chain` only chomping whole characters. The cut
// itself is at an ASCII `.`, so it falls on a character boundary.
unsafe { std::str::from_utf8_unchecked(&slice[..index - 1]) };
|
||||||
|
// SAFETY (assumed, TODO confirm): same reasoning as for `module_name`.
let type_name = unsafe { std::str::from_utf8_unchecked(&slice[index..]) };
|
||||||
|
|
||||||
|
Ok((module_name, type_name, width))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Unqualified: the first uppercase part is the whole type name.
Ok(("", first, first.len()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn chomp_access_chain<'a>(buffer: &'a [u8], parts: &mut Vec<'a, &'a str>) -> Result<u16, u16> {
|
fn chomp_access_chain<'a>(buffer: &'a [u8], parts: &mut Vec<'a, &'a str>) -> Result<u16, u16> {
|
||||||
let mut chomped = 0;
|
let mut chomped = 0;
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
use crate::ast::{AssignedField, Tag, TypeAnnotation};
|
use crate::ast::{AssignedField, Tag, TypeAnnotation};
|
||||||
use crate::blankspace::{space0_around_ee, space0_before_e, space0_e};
|
use crate::blankspace::{space0_around_ee, space0_before_e, space0_e};
|
||||||
use crate::ident::join_module_parts;
|
|
||||||
use crate::keyword;
|
use crate::keyword;
|
||||||
use crate::parser::{
|
use crate::parser::{
|
||||||
allocated, backtrackable, not_e, optional, peek_utf8_char_e, specialize, specialize_ref, word1,
|
allocated, backtrackable, not_e, optional, peek_utf8_char_e, specialize, specialize_ref, word1,
|
||||||
|
@ -517,102 +516,12 @@ fn expression<'a>(min_indent: u16) -> impl Parser<'a, Located<TypeAnnotation<'a>
|
||||||
|
|
||||||
fn parse_concrete_type<'a>(
|
fn parse_concrete_type<'a>(
|
||||||
arena: &'a Bump,
|
arena: &'a Bump,
|
||||||
mut state: State<'a>,
|
state: State<'a>,
|
||||||
) -> ParseResult<'a, TypeAnnotation<'a>, TApply> {
|
) -> ParseResult<'a, TypeAnnotation<'a>, TApply> {
|
||||||
let mut part_buf = String::new_in(arena); // The current "part" (parts are dot-separated.)
|
let (_, (module_name, type_name), state) =
|
||||||
let mut parts: Vec<&'a str> = Vec::new_in(arena);
|
specialize(|_, r, c| TApply::End(r, c), crate::ident::concrete_type())
|
||||||
|
.parse(arena, state)?;
|
||||||
// Qualified types must start with a capitalized letter.
|
let answer = TypeAnnotation::Apply(module_name, type_name, &[]);
|
||||||
match peek_utf8_char_e(&state, TApply::StartNotUppercase, TApply::Space) {
|
|
||||||
Ok((first_letter, bytes_parsed)) => {
|
|
||||||
if first_letter.is_alphabetic() && first_letter.is_uppercase() {
|
|
||||||
part_buf.push(first_letter);
|
|
||||||
} else {
|
|
||||||
let problem = TApply::StartNotUppercase(state.line, state.column + 1);
|
|
||||||
return Err((NoProgress, problem, state));
|
|
||||||
}
|
|
||||||
|
|
||||||
state = state.advance_without_indenting_e(bytes_parsed, TApply::Space)?;
|
|
||||||
}
|
|
||||||
Err(reason) => return Err((NoProgress, reason, state)),
|
|
||||||
}
|
|
||||||
|
|
||||||
while !state.bytes.is_empty() {
|
|
||||||
match peek_utf8_char_e(&state, TApply::End, TApply::Space) {
|
|
||||||
Ok((ch, bytes_parsed)) => {
|
|
||||||
// After the first character, only these are allowed:
|
|
||||||
//
|
|
||||||
// * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
|
|
||||||
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
|
|
||||||
// * A dot ('.')
|
|
||||||
if ch.is_alphabetic() {
|
|
||||||
if part_buf.is_empty() && !ch.is_uppercase() {
|
|
||||||
// Each part must begin with a capital letter.
|
|
||||||
return Err((
|
|
||||||
MadeProgress,
|
|
||||||
TApply::StartNotUppercase(state.line, state.column),
|
|
||||||
state,
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
part_buf.push(ch);
|
|
||||||
} else if ch.is_ascii_digit() {
|
|
||||||
// Parts may not start with numbers!
|
|
||||||
if part_buf.is_empty() {
|
|
||||||
return Err((
|
|
||||||
MadeProgress,
|
|
||||||
TApply::StartIsNumber(state.line, state.column),
|
|
||||||
state,
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
part_buf.push(ch);
|
|
||||||
} else if ch == '.' {
|
|
||||||
// Having two consecutive dots is an error.
|
|
||||||
if part_buf.is_empty() {
|
|
||||||
return Err((
|
|
||||||
MadeProgress,
|
|
||||||
TApply::DoubleDot(state.line, state.column),
|
|
||||||
state,
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
parts.push(part_buf.into_bump_str());
|
|
||||||
|
|
||||||
// Now that we've recorded the contents of the current buffer, reset it.
|
|
||||||
part_buf = String::new_in(arena);
|
|
||||||
} else {
|
|
||||||
// This must be the end of the type. We're done!
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
state = state.advance_without_indenting_e(bytes_parsed, TApply::Space)?;
|
|
||||||
}
|
|
||||||
Err(reason) => {
|
|
||||||
return Err((MadeProgress, reason, state));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if part_buf.is_empty() {
|
|
||||||
// We probably had a trailing dot, e.g. `Foo.bar.` - this is malformed!
|
|
||||||
//
|
|
||||||
// This condition might also occur if we encounter a malformed accessor like `.|`
|
|
||||||
//
|
|
||||||
// If we made it this far and don't have a next_char, then necessarily
|
|
||||||
// we have consumed a '.' char previously.
|
|
||||||
return Err((
|
|
||||||
MadeProgress,
|
|
||||||
TApply::TrailingDot(state.line, state.column),
|
|
||||||
state,
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
let answer = TypeAnnotation::Apply(
|
|
||||||
join_module_parts(arena, parts.into_bump_slice()),
|
|
||||||
part_buf.into_bump_str(),
|
|
||||||
&[],
|
|
||||||
);
|
|
||||||
|
|
||||||
Ok((MadeProgress, answer, state))
|
Ok((MadeProgress, answer, state))
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue