mirror of
https://github.com/roc-lang/roc.git
synced 2025-09-28 22:34:45 +00:00
Fix remaining UTF-8 parsing issues
This commit is contained in:
parent
eaaeda728a
commit
273528db77
5 changed files with 358 additions and 267 deletions
|
@ -1,7 +1,7 @@
|
||||||
use crate::ast::CommentOrNewline::{self, *};
|
use crate::ast::CommentOrNewline::{self, *};
|
||||||
use crate::ast::Spaceable;
|
use crate::ast::Spaceable;
|
||||||
use crate::parser::{
|
use crate::parser::{
|
||||||
self, and, peek_utf8_char, unexpected, unexpected_eof, Fail, FailReason, Parser, State,
|
self, and, peek_utf8_char, unexpected, unexpected_eof, FailReason, Parser, State,
|
||||||
};
|
};
|
||||||
use bumpalo::collections::string::String;
|
use bumpalo::collections::string::String;
|
||||||
use bumpalo::collections::vec::Vec;
|
use bumpalo::collections::vec::Vec;
|
||||||
|
@ -219,7 +219,7 @@ fn spaces<'a>(
|
||||||
move |arena: &'a Bump, state: State<'a>| {
|
move |arena: &'a Bump, state: State<'a>| {
|
||||||
let original_state = state.clone();
|
let original_state = state.clone();
|
||||||
let mut space_list = Vec::new_in(arena);
|
let mut space_list = Vec::new_in(arena);
|
||||||
let mut chars_parsed = 0;
|
let mut bytes_parsed = 0;
|
||||||
let mut comment_line_buf = String::new_in(arena);
|
let mut comment_line_buf = String::new_in(arena);
|
||||||
let mut line_state = LineState::Normal;
|
let mut line_state = LineState::Normal;
|
||||||
let mut state = state;
|
let mut state = state;
|
||||||
|
@ -227,8 +227,8 @@ fn spaces<'a>(
|
||||||
|
|
||||||
while !state.bytes.is_empty() {
|
while !state.bytes.is_empty() {
|
||||||
match peek_utf8_char(&state) {
|
match peek_utf8_char(&state) {
|
||||||
Ok(ch) => {
|
Ok((ch, utf8_len)) => {
|
||||||
chars_parsed += 1;
|
bytes_parsed += utf8_len;
|
||||||
|
|
||||||
match line_state {
|
match line_state {
|
||||||
LineState::Normal => {
|
LineState::Normal => {
|
||||||
|
@ -263,7 +263,7 @@ fn spaces<'a>(
|
||||||
line_state = LineState::Comment;
|
line_state = LineState::Comment;
|
||||||
}
|
}
|
||||||
_ => {
|
_ => {
|
||||||
return if require_at_least_one && chars_parsed <= 1 {
|
return if require_at_least_one && bytes_parsed <= 1 {
|
||||||
// We've parsed 1 char and it was not a space,
|
// We've parsed 1 char and it was not a space,
|
||||||
// but we require parsing at least one space!
|
// but we require parsing at least one space!
|
||||||
Err(unexpected(0, state.clone(), state.attempting))
|
Err(unexpected(0, state.clone(), state.attempting))
|
||||||
|
@ -349,8 +349,7 @@ fn spaces<'a>(
|
||||||
line_state = LineState::Normal;
|
line_state = LineState::Normal;
|
||||||
}
|
}
|
||||||
nonblank => {
|
nonblank => {
|
||||||
// Chars can have byte lengths of more than 1!
|
state = state.advance_without_indenting(utf8_len)?;
|
||||||
state = state.advance_without_indenting(nonblank.len_utf8())?;
|
|
||||||
|
|
||||||
comment_line_buf.push(nonblank);
|
comment_line_buf.push(nonblank);
|
||||||
}
|
}
|
||||||
|
@ -358,21 +357,12 @@ fn spaces<'a>(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(Fail {
|
Err(FailReason::BadUtf8) => {
|
||||||
reason: FailReason::BadUtf8,
|
|
||||||
attempting,
|
|
||||||
}) => {
|
|
||||||
// If we hit an invalid UTF-8 character, bail out immediately.
|
// If we hit an invalid UTF-8 character, bail out immediately.
|
||||||
return Err((
|
return state.fail(FailReason::BadUtf8);
|
||||||
Fail {
|
|
||||||
reason: dbg!(FailReason::BadUtf8),
|
|
||||||
attempting,
|
|
||||||
},
|
|
||||||
state,
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
if require_at_least_one && chars_parsed == 0 {
|
if require_at_least_one && bytes_parsed == 0 {
|
||||||
return Err(unexpected_eof(0, state.attempting, state));
|
return Err(unexpected_eof(0, state.attempting, state));
|
||||||
} else {
|
} else {
|
||||||
let space_slice = space_list.into_bump_slice();
|
let space_slice = space_list.into_bump_slice();
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
use crate::ast::Attempting;
|
use crate::ast::Attempting;
|
||||||
use crate::keyword;
|
use crate::keyword;
|
||||||
use crate::parser::{unexpected, utf8_char, Fail, FailReason, ParseResult, Parser, State};
|
use crate::parser::{peek_utf8_char, unexpected, Fail, FailReason, ParseResult, Parser, State};
|
||||||
use bumpalo::collections::string::String;
|
use bumpalo::collections::string::String;
|
||||||
use bumpalo::collections::vec::Vec;
|
use bumpalo::collections::vec::Vec;
|
||||||
use bumpalo::Bump;
|
use bumpalo::Bump;
|
||||||
|
@ -69,7 +69,7 @@ impl<'a> Ident<'a> {
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
pub fn parse_ident<'a>(
|
pub fn parse_ident<'a>(
|
||||||
arena: &'a Bump,
|
arena: &'a Bump,
|
||||||
state: State<'a>,
|
mut state: State<'a>,
|
||||||
) -> ParseResult<'a, (Ident<'a>, Option<char>)> {
|
) -> ParseResult<'a, (Ident<'a>, Option<char>)> {
|
||||||
let mut part_buf = String::new_in(arena); // The current "part" (parts are dot-separated.)
|
let mut part_buf = String::new_in(arena); // The current "part" (parts are dot-separated.)
|
||||||
let mut capitalized_parts: Vec<&'a str> = Vec::new_in(arena);
|
let mut capitalized_parts: Vec<&'a str> = Vec::new_in(arena);
|
||||||
|
@ -80,93 +80,112 @@ pub fn parse_ident<'a>(
|
||||||
|
|
||||||
// Identifiers and accessor functions must start with either a letter or a dot.
|
// Identifiers and accessor functions must start with either a letter or a dot.
|
||||||
// If this starts with neither, it must be something else!
|
// If this starts with neither, it must be something else!
|
||||||
let (first_ch, mut state) = utf8_char().parse(arena, state)?;
|
match peek_utf8_char(&state) {
|
||||||
|
Ok((first_ch, bytes_parsed)) => {
|
||||||
|
if first_ch.is_alphabetic() {
|
||||||
|
part_buf.push(first_ch);
|
||||||
|
|
||||||
if first_ch.is_alphabetic() {
|
is_capitalized = first_ch.is_uppercase();
|
||||||
part_buf.push(first_ch);
|
is_accessor_fn = false;
|
||||||
|
|
||||||
is_capitalized = first_ch.is_uppercase();
|
state = state.advance_without_indenting(bytes_parsed)?;
|
||||||
is_accessor_fn = false;
|
} else if first_ch == '.' {
|
||||||
} else if first_ch == '.' {
|
is_capitalized = false;
|
||||||
is_capitalized = false;
|
is_accessor_fn = true;
|
||||||
is_accessor_fn = true;
|
|
||||||
} else if first_ch == '@' {
|
|
||||||
// '@' must always be followed by a capital letter!
|
|
||||||
let (next_ch, new_state) = utf8_char().parse(arena, state)?;
|
|
||||||
|
|
||||||
state = new_state;
|
state = state.advance_without_indenting(bytes_parsed)?;
|
||||||
|
} else if first_ch == '@' {
|
||||||
|
state = state.advance_without_indenting(bytes_parsed)?;
|
||||||
|
|
||||||
if next_ch.is_uppercase() {
|
// '@' must always be followed by a capital letter!
|
||||||
part_buf.push('@');
|
match peek_utf8_char(&state) {
|
||||||
part_buf.push(next_ch);
|
Ok((next_ch, next_bytes_parsed)) => {
|
||||||
|
if next_ch.is_uppercase() {
|
||||||
|
state = state.advance_without_indenting(next_bytes_parsed)?;
|
||||||
|
|
||||||
is_private_tag = true;
|
part_buf.push('@');
|
||||||
is_capitalized = true;
|
part_buf.push(next_ch);
|
||||||
is_accessor_fn = false;
|
|
||||||
} else {
|
is_private_tag = true;
|
||||||
return Err(unexpected(0, state, Attempting::Identifier));
|
is_capitalized = true;
|
||||||
|
is_accessor_fn = false;
|
||||||
|
} else {
|
||||||
|
return Err(unexpected(
|
||||||
|
bytes_parsed + next_bytes_parsed,
|
||||||
|
state,
|
||||||
|
Attempting::Identifier,
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(reason) => return state.fail(reason),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return Err(unexpected(0, state, Attempting::Identifier));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
Err(reason) => return state.fail(reason),
|
||||||
return Err(unexpected(0, state, Attempting::Identifier));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
while !state.bytes.is_empty() {
|
while !state.bytes.is_empty() {
|
||||||
let (ch, new_state) = utf8_char().parse(arena, state)?;
|
match peek_utf8_char(&state) {
|
||||||
|
Ok((ch, bytes_parsed)) => {
|
||||||
|
// After the first character, only these are allowed:
|
||||||
|
//
|
||||||
|
// * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
|
||||||
|
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
|
||||||
|
// * A dot ('.')
|
||||||
|
if ch.is_alphabetic() {
|
||||||
|
if part_buf.is_empty() {
|
||||||
|
// Capitalization is determined by the first character in the part.
|
||||||
|
is_capitalized = ch.is_uppercase();
|
||||||
|
}
|
||||||
|
|
||||||
state = new_state;
|
part_buf.push(ch);
|
||||||
|
} else if ch.is_ascii_digit() {
|
||||||
|
// Parts may not start with numbers!
|
||||||
|
if part_buf.is_empty() {
|
||||||
|
return malformed(
|
||||||
|
Some(ch),
|
||||||
|
arena,
|
||||||
|
state,
|
||||||
|
capitalized_parts,
|
||||||
|
noncapitalized_parts,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
// After the first character, only these are allowed:
|
part_buf.push(ch);
|
||||||
//
|
} else if ch == '.' {
|
||||||
// * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
|
// There are two possible errors here:
|
||||||
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
|
//
|
||||||
// * A dot ('.')
|
// 1. Having two consecutive dots is an error.
|
||||||
if ch.is_alphabetic() {
|
// 2. Having capitalized parts after noncapitalized (e.g. `foo.Bar`) is an error.
|
||||||
if part_buf.is_empty() {
|
if part_buf.is_empty() || (is_capitalized && !noncapitalized_parts.is_empty()) {
|
||||||
// Capitalization is determined by the first character in the part.
|
return malformed(
|
||||||
is_capitalized = ch.is_uppercase();
|
Some(ch),
|
||||||
|
arena,
|
||||||
|
state,
|
||||||
|
capitalized_parts,
|
||||||
|
noncapitalized_parts,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
if is_capitalized {
|
||||||
|
capitalized_parts.push(part_buf.into_bump_str());
|
||||||
|
} else {
|
||||||
|
noncapitalized_parts.push(part_buf.into_bump_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now that we've recorded the contents of the current buffer, reset it.
|
||||||
|
part_buf = String::new_in(arena);
|
||||||
|
} else {
|
||||||
|
// This must be the end of the identifier. We're done!
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
state = state.advance_without_indenting(bytes_parsed)?;
|
||||||
}
|
}
|
||||||
|
Err(reason) => return state.fail(reason),
|
||||||
part_buf.push(ch);
|
|
||||||
} else if ch.is_ascii_digit() {
|
|
||||||
// Parts may not start with numbers!
|
|
||||||
if part_buf.is_empty() {
|
|
||||||
return malformed(
|
|
||||||
Some(ch),
|
|
||||||
arena,
|
|
||||||
state,
|
|
||||||
capitalized_parts,
|
|
||||||
noncapitalized_parts,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
part_buf.push(ch);
|
|
||||||
} else if ch == '.' {
|
|
||||||
// There are two possible errors here:
|
|
||||||
//
|
|
||||||
// 1. Having two consecutive dots is an error.
|
|
||||||
// 2. Having capitalized parts after noncapitalized (e.g. `foo.Bar`) is an error.
|
|
||||||
if part_buf.is_empty() || (is_capitalized && !noncapitalized_parts.is_empty()) {
|
|
||||||
return malformed(
|
|
||||||
Some(ch),
|
|
||||||
arena,
|
|
||||||
state,
|
|
||||||
capitalized_parts,
|
|
||||||
noncapitalized_parts,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
if is_capitalized {
|
|
||||||
capitalized_parts.push(part_buf.into_bump_str());
|
|
||||||
} else {
|
|
||||||
noncapitalized_parts.push(part_buf.into_bump_str());
|
|
||||||
}
|
|
||||||
|
|
||||||
// Now that we've recorded the contents of the current buffer, reset it.
|
|
||||||
part_buf = String::new_in(arena);
|
|
||||||
} else {
|
|
||||||
// This must be the end of the identifier. We're done!
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -262,26 +281,27 @@ fn malformed<'a>(
|
||||||
let mut next_char = None;
|
let mut next_char = None;
|
||||||
|
|
||||||
while !state.bytes.is_empty() {
|
while !state.bytes.is_empty() {
|
||||||
let (ch, new_state) = utf8_char().parse(arena, state)?;
|
match peek_utf8_char(&state) {
|
||||||
|
Ok((ch, bytes_parsed)) => {
|
||||||
|
// We can't use ch.is_alphanumeric() here because that passes for
|
||||||
|
// things that are "numeric" but not ASCII digits, like `¾`
|
||||||
|
if ch == '.' || ch.is_alphabetic() || ch.is_ascii_digit() {
|
||||||
|
full_string.push(ch);
|
||||||
|
} else {
|
||||||
|
next_char = Some(ch);
|
||||||
|
|
||||||
state = new_state;
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
// We can't use ch.is_alphanumeric() here because that passes for
|
state = state.advance_without_indenting(bytes_parsed)?;
|
||||||
// things that are "numeric" but not ASCII digits, like `¾`
|
}
|
||||||
if ch == '.' || ch.is_alphabetic() || ch.is_ascii_digit() {
|
Err(reason) => return state.fail(reason),
|
||||||
full_string.push(ch);
|
|
||||||
} else {
|
|
||||||
next_char = Some(ch);
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let chars_parsed = full_string.len();
|
|
||||||
|
|
||||||
Ok((
|
Ok((
|
||||||
(Ident::Malformed(full_string.into_bump_str()), next_char),
|
(Ident::Malformed(full_string.into_bump_str()), next_char),
|
||||||
state.advance_without_indenting(chars_parsed)?,
|
state,
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -298,42 +318,47 @@ pub fn global_tag_or_ident<'a, F>(pred: F) -> impl Parser<'a, &'a str>
|
||||||
where
|
where
|
||||||
F: Fn(char) -> bool,
|
F: Fn(char) -> bool,
|
||||||
{
|
{
|
||||||
move |arena, state: State<'a>| {
|
move |arena, mut state: State<'a>| {
|
||||||
// pred will determine if this is a tag or ident (based on capitalization)
|
// pred will determine if this is a tag or ident (based on capitalization)
|
||||||
let (first_letter, mut state) = utf8_char().parse(arena, state)?;
|
let (first_letter, bytes_parsed) = match peek_utf8_char(&state) {
|
||||||
|
Ok((first_letter, bytes_parsed)) => {
|
||||||
|
if !pred(first_letter) {
|
||||||
|
return Err(unexpected(0, state, Attempting::RecordFieldLabel));
|
||||||
|
}
|
||||||
|
|
||||||
if !pred(first_letter) {
|
(first_letter, bytes_parsed)
|
||||||
return Err(unexpected(0, state, Attempting::RecordFieldLabel));
|
}
|
||||||
}
|
Err(reason) => return state.fail(reason),
|
||||||
|
};
|
||||||
|
|
||||||
let mut buf = String::with_capacity_in(1, arena);
|
let mut buf = String::with_capacity_in(1, arena);
|
||||||
|
|
||||||
buf.push(first_letter);
|
buf.push(first_letter);
|
||||||
|
|
||||||
|
state = state.advance_without_indenting(bytes_parsed)?;
|
||||||
|
|
||||||
while !state.bytes.is_empty() {
|
while !state.bytes.is_empty() {
|
||||||
let (ch, new_state) = utf8_char().parse(arena, state)?;
|
match peek_utf8_char(&state) {
|
||||||
|
Ok((ch, bytes_parsed)) => {
|
||||||
|
// After the first character, only these are allowed:
|
||||||
|
//
|
||||||
|
// * Unicode alphabetic chars - you might include `鹏` if that's clear to your readers
|
||||||
|
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
|
||||||
|
// * A ':' indicating the end of the field
|
||||||
|
if ch.is_alphabetic() || ch.is_ascii_digit() {
|
||||||
|
buf.push(ch);
|
||||||
|
|
||||||
state = new_state;
|
state = state.advance_without_indenting(bytes_parsed)?;
|
||||||
|
} else {
|
||||||
// After the first character, only these are allowed:
|
// This is the end of the field. We're done!
|
||||||
//
|
break;
|
||||||
// * Unicode alphabetic chars - you might include `鹏` if that's clear to your readers
|
}
|
||||||
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
|
}
|
||||||
// * A ':' indicating the end of the field
|
Err(reason) => return state.fail(reason),
|
||||||
if ch.is_alphabetic() || ch.is_ascii_digit() {
|
};
|
||||||
buf.push(ch);
|
|
||||||
} else {
|
|
||||||
// This is the end of the field. We're done!
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let chars_parsed = buf.len();
|
Ok((buf.into_bump_str(), state))
|
||||||
|
|
||||||
Ok((
|
|
||||||
buf.into_bump_str(),
|
|
||||||
state.advance_without_indenting(chars_parsed)?,
|
|
||||||
))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -7,7 +7,8 @@ use crate::expr::def;
|
||||||
use crate::header::ModuleName;
|
use crate::header::ModuleName;
|
||||||
use crate::ident::unqualified_ident;
|
use crate::ident::unqualified_ident;
|
||||||
use crate::parser::{
|
use crate::parser::{
|
||||||
self, ascii_char, ascii_string, loc, optional, unexpected, utf8_char, Parser, State,
|
self, ascii_char, ascii_string, loc, optional, peek_utf8_char, peek_utf8_char_at, unexpected,
|
||||||
|
Parser, State,
|
||||||
};
|
};
|
||||||
use bumpalo::collections::{String, Vec};
|
use bumpalo::collections::{String, Vec};
|
||||||
use roc_region::all::Located;
|
use roc_region::all::Located;
|
||||||
|
@ -61,57 +62,68 @@ pub fn interface_header<'a>() -> impl Parser<'a, InterfaceHeader<'a>> {
|
||||||
|
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
pub fn module_name<'a>() -> impl Parser<'a, ModuleName<'a>> {
|
pub fn module_name<'a>() -> impl Parser<'a, ModuleName<'a>> {
|
||||||
move |arena, state: State<'a>| {
|
move |arena, mut state: State<'a>| {
|
||||||
let (first_letter, mut state) = utf8_char().parse(arena, state)?;
|
match peek_utf8_char(&state) {
|
||||||
|
Ok((first_letter, bytes_parsed)) => {
|
||||||
|
if !first_letter.is_uppercase() {
|
||||||
|
return Err(unexpected(0, state, Attempting::Module));
|
||||||
|
};
|
||||||
|
|
||||||
if !first_letter.is_uppercase() {
|
let mut buf = String::with_capacity_in(1, arena);
|
||||||
return Err(unexpected(0, state, Attempting::Module));
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut buf = String::with_capacity_in(1, arena);
|
buf.push(first_letter);
|
||||||
|
|
||||||
buf.push(first_letter);
|
state = state.advance_without_indenting(bytes_parsed)?;
|
||||||
|
|
||||||
while !state.bytes.is_empty() {
|
while !state.bytes.is_empty() {
|
||||||
let (ch, new_state) = utf8_char().parse(arena, state)?;
|
match peek_utf8_char(&state) {
|
||||||
|
Ok((ch, bytes_parsed)) => {
|
||||||
|
// After the first character, only these are allowed:
|
||||||
|
//
|
||||||
|
// * Unicode alphabetic chars - you might include `鹏` if that's clear to your readers
|
||||||
|
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
|
||||||
|
// * A '.' separating module parts
|
||||||
|
if ch.is_alphabetic() || ch.is_ascii_digit() {
|
||||||
|
buf.push(ch);
|
||||||
|
|
||||||
state = new_state;
|
state = state.advance_without_indenting(bytes_parsed)?;
|
||||||
|
} else if ch == '.' {
|
||||||
|
match peek_utf8_char_at(&state, 1) {
|
||||||
|
Ok((next, next_bytes_parsed)) => {
|
||||||
|
if next.is_uppercase() {
|
||||||
|
// If we hit another uppercase letter, keep going!
|
||||||
|
buf.push('.');
|
||||||
|
buf.push(next);
|
||||||
|
|
||||||
// After the first character, only these are allowed:
|
state = state.advance_without_indenting(
|
||||||
//
|
bytes_parsed + next_bytes_parsed,
|
||||||
// * Unicode alphabetic chars - you might include `鹏` if that's clear to your readers
|
)?;
|
||||||
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
|
} else {
|
||||||
// * A '.' separating module parts
|
// We have finished parsing the module name.
|
||||||
if ch.is_alphabetic() || ch.is_ascii_digit() {
|
//
|
||||||
buf.push(ch);
|
// There may be an identifier after this '.',
|
||||||
} else if ch == '.' {
|
// e.g. "baz" in `Foo.Bar.baz`
|
||||||
let (next, new_state) = utf8_char().parse(arena, state)?;
|
return Ok((
|
||||||
|
ModuleName::new(buf.into_bump_str()),
|
||||||
state = new_state;
|
state,
|
||||||
|
));
|
||||||
if next.is_uppercase() {
|
}
|
||||||
// If we hit another uppercase letter, keep going!
|
}
|
||||||
buf.push('.');
|
Err(reason) => return state.fail(reason),
|
||||||
buf.push(next);
|
}
|
||||||
} else {
|
} else {
|
||||||
let chars_parsed = buf.len();
|
// This is the end of the module name. We're done!
|
||||||
|
break;
|
||||||
// We have finished parsing the module name.
|
}
|
||||||
//
|
}
|
||||||
// There may be an identifier after this '.',
|
Err(reason) => return state.fail(reason),
|
||||||
// e.g. "baz" in `Foo.Bar.baz`
|
}
|
||||||
return Ok((
|
|
||||||
ModuleName::new(buf.into_bump_str()),
|
|
||||||
state.advance_without_indenting(chars_parsed)?,
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
// This is the end of the module name. We're done!
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok((ModuleName::new(buf.into_bump_str()), state))
|
Ok((ModuleName::new(buf.into_bump_str()), state))
|
||||||
|
}
|
||||||
|
Err(reason) => state.fail(reason),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -3,11 +3,12 @@ use bumpalo::collections::vec::Vec;
|
||||||
use bumpalo::Bump;
|
use bumpalo::Bump;
|
||||||
use encode_unicode::CharExt;
|
use encode_unicode::CharExt;
|
||||||
use roc_region::all::{Located, Region};
|
use roc_region::all::{Located, Region};
|
||||||
|
use std::fmt;
|
||||||
use std::str::from_utf8;
|
use std::str::from_utf8;
|
||||||
use std::{char, mem, u16};
|
use std::{char, mem, u16};
|
||||||
|
|
||||||
/// A position in a source file.
|
/// A position in a source file.
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
#[derive(Clone, PartialEq, Eq)]
|
||||||
pub struct State<'a> {
|
pub struct State<'a> {
|
||||||
/// The raw input bytes from the file.
|
/// The raw input bytes from the file.
|
||||||
pub bytes: &'a [u8],
|
pub bytes: &'a [u8],
|
||||||
|
@ -101,7 +102,7 @@ impl<'a> State<'a> {
|
||||||
/// This assumes we are *not* advancing with spaces, or at least that
|
/// This assumes we are *not* advancing with spaces, or at least that
|
||||||
/// any spaces on the line were preceded by non-spaces - which would mean
|
/// any spaces on the line were preceded by non-spaces - which would mean
|
||||||
/// they weren't eligible to indent anyway.
|
/// they weren't eligible to indent anyway.
|
||||||
pub fn advance_without_indenting(&self, quantity: usize) -> Result<Self, (Fail, Self)> {
|
pub fn advance_without_indenting(self, quantity: usize) -> Result<Self, (Fail, Self)> {
|
||||||
match (self.column as usize).checked_add(quantity) {
|
match (self.column as usize).checked_add(quantity) {
|
||||||
Some(column_usize) if column_usize <= u16::MAX as usize => {
|
Some(column_usize) if column_usize <= u16::MAX as usize => {
|
||||||
Ok(State {
|
Ok(State {
|
||||||
|
@ -184,6 +185,24 @@ impl<'a> State<'a> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<'a> fmt::Debug for State<'a> {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
|
write!(f, "State {{")?;
|
||||||
|
|
||||||
|
match from_utf8(self.bytes) {
|
||||||
|
Ok(string) => write!(f, "\n\tbytes: [utf8] {:?}", string)?,
|
||||||
|
Err(_) => write!(f, "\n\tbytes: [invalid utf8] {:?}", self.bytes)?,
|
||||||
|
}
|
||||||
|
|
||||||
|
write!(f, "\n\t(line, col): ({}, {}),", self.line, self.column)?;
|
||||||
|
write!(f, "\n\tindent_col: {}", self.indent_col)?;
|
||||||
|
write!(f, "\n\tis_indenting: {:?}", self.is_indenting)?;
|
||||||
|
write!(f, "\n\tattempting: {:?}", self.attempting)?;
|
||||||
|
write!(f, "\n\toriginal_len: {}", self.original_len)?;
|
||||||
|
write!(f, "\n}}")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn state_size() {
|
fn state_size() {
|
||||||
// State should always be under 8 machine words, so it fits in a typical
|
// State should always be under 8 machine words, so it fits in a typical
|
||||||
|
@ -428,14 +447,14 @@ pub fn ascii_char<'a>(expected: char) -> impl Parser<'a, ()> {
|
||||||
|
|
||||||
/// A single UTF-8-encoded char. This will both parse *and* validate that the
|
/// A single UTF-8-encoded char. This will both parse *and* validate that the
|
||||||
/// char is valid UTF-8.
|
/// char is valid UTF-8.
|
||||||
pub fn utf8_char<'a>() -> impl Parser<'a, char> {
|
pub fn utf8_char2<'a>() -> impl Parser<'a, char> {
|
||||||
move |_arena, state: State<'a>| {
|
move |_arena, state: State<'a>| {
|
||||||
if !state.bytes.is_empty() {
|
if !state.bytes.is_empty() {
|
||||||
match char::from_utf8_slice_start(state.bytes) {
|
match char::from_utf8_slice_start(state.bytes) {
|
||||||
Ok((ch, bytes_parsed)) => {
|
Ok((ch, bytes_parsed)) => {
|
||||||
return Ok((ch, state.advance_without_indenting(bytes_parsed)?))
|
return Ok((ch, state.advance_without_indenting(bytes_parsed)?))
|
||||||
}
|
}
|
||||||
Err(_) => return state.fail(dbg!(FailReason::BadUtf8)),
|
Err(_) => return state.fail(FailReason::BadUtf8),
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
Err(unexpected_eof(0, state.attempting, state))
|
Err(unexpected_eof(0, state.attempting, state))
|
||||||
|
@ -445,17 +464,40 @@ pub fn utf8_char<'a>() -> impl Parser<'a, char> {
|
||||||
|
|
||||||
/// A single UTF-8-encoded char. This will both parse *and* validate that the
|
/// A single UTF-8-encoded char. This will both parse *and* validate that the
|
||||||
/// char is valid UTF-8, but it will *not* advance the state.
|
/// char is valid UTF-8, but it will *not* advance the state.
|
||||||
pub fn peek_utf8_char<'a>(state: &State<'a>) -> Result<char, Fail> {
|
pub fn peek_utf8_char<'a>(state: &State<'a>) -> Result<(char, usize), FailReason> {
|
||||||
match char::from_utf8_slice_start(state.bytes) {
|
if !state.bytes.is_empty() {
|
||||||
Ok((ch, _)) => Ok(ch),
|
match char::from_utf8_slice_start(state.bytes) {
|
||||||
Err(_) => Err(Fail {
|
Ok((ch, len_utf8)) => Ok((ch, len_utf8)),
|
||||||
reason: dbg!(FailReason::BadUtf8),
|
Err(_) => Err(FailReason::BadUtf8),
|
||||||
attempting: state.attempting,
|
}
|
||||||
}),
|
} else {
|
||||||
|
Err(FailReason::Eof(
|
||||||
|
Region::zero(), /* TODO get a better region */
|
||||||
|
))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A hardcoded string consisting only of ASCII characters.
|
/// A single UTF-8-encoded char, with an offset. This will both parse *and*
|
||||||
|
/// validate that the char is valid UTF-8, but it will *not* advance the state.
|
||||||
|
pub fn peek_utf8_char_at<'a>(
|
||||||
|
state: &State<'a>,
|
||||||
|
offset: usize,
|
||||||
|
) -> Result<(char, usize), FailReason> {
|
||||||
|
if state.bytes.len() > offset {
|
||||||
|
let bytes = &state.bytes[offset..];
|
||||||
|
|
||||||
|
match char::from_utf8_slice_start(bytes) {
|
||||||
|
Ok((ch, len_utf8)) => Ok((ch, len_utf8)),
|
||||||
|
Err(_) => Err(FailReason::BadUtf8),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
Err(FailReason::Eof(
|
||||||
|
Region::zero(), /* TODO get a better region */
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A hardcoded string with no newlines, consisting only of ASCII characters
|
||||||
pub fn ascii_string<'a>(keyword: &'static str) -> impl Parser<'a, ()> {
|
pub fn ascii_string<'a>(keyword: &'static str) -> impl Parser<'a, ()> {
|
||||||
// Verify that this really is exclusively ASCII characters.
|
// Verify that this really is exclusively ASCII characters.
|
||||||
// The `unsafe` block in this function relies upon this assumption!
|
// The `unsafe` block in this function relies upon this assumption!
|
||||||
|
@ -472,10 +514,12 @@ pub fn ascii_string<'a>(keyword: &'static str) -> impl Parser<'a, ()> {
|
||||||
// SAFETY: Roc language keywords are statically known to contain only
|
// SAFETY: Roc language keywords are statically known to contain only
|
||||||
// ASCII characters, which means their &str will be 100% u8 values in
|
// ASCII characters, which means their &str will be 100% u8 values in
|
||||||
// memory, and thus can be safely interpreted as &[u8]
|
// memory, and thus can be safely interpreted as &[u8]
|
||||||
Some(next_str)
|
Some(next_str) => {
|
||||||
if next_str == unsafe { mem::transmute::<&'static str, &'a [u8]>(keyword) } =>
|
if next_str == unsafe { mem::transmute::<&'static str, &'a [u8]>(keyword) } {
|
||||||
{
|
Ok(((), state.advance_without_indenting(len)?))
|
||||||
Ok(((), state.advance_without_indenting(len)?))
|
} else {
|
||||||
|
Err(unexpected(len, state, Attempting::Keyword))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
_ => Err(unexpected_eof(0, Attempting::Keyword, state)),
|
_ => Err(unexpected_eof(0, Attempting::Keyword, state)),
|
||||||
}
|
}
|
||||||
|
@ -1126,6 +1170,6 @@ where
|
||||||
pub fn parse_utf8<'a>(bytes: &'a [u8]) -> Result<&'a str, FailReason> {
|
pub fn parse_utf8<'a>(bytes: &'a [u8]) -> Result<&'a str, FailReason> {
|
||||||
match from_utf8(bytes) {
|
match from_utf8(bytes) {
|
||||||
Ok(string) => Ok(string),
|
Ok(string) => Ok(string),
|
||||||
Err(_) => Err(dbg!(FailReason::BadUtf8)),
|
Err(_) => Err(FailReason::BadUtf8),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,8 +4,8 @@ use crate::expr::{global_tag, private_tag};
|
||||||
use crate::ident::join_module_parts;
|
use crate::ident::join_module_parts;
|
||||||
use crate::keyword;
|
use crate::keyword;
|
||||||
use crate::parser::{
|
use crate::parser::{
|
||||||
allocated, ascii_char, ascii_string, not, optional, unexpected, utf8_char, Either, ParseResult,
|
allocated, ascii_char, ascii_string, not, optional, peek_utf8_char, unexpected, Either,
|
||||||
Parser, State,
|
ParseResult, Parser, State,
|
||||||
};
|
};
|
||||||
use bumpalo::collections::string::String;
|
use bumpalo::collections::string::String;
|
||||||
use bumpalo::collections::vec::Vec;
|
use bumpalo::collections::vec::Vec;
|
||||||
|
@ -263,61 +263,69 @@ fn expression<'a>(min_indent: u16) -> impl Parser<'a, Located<TypeAnnotation<'a>
|
||||||
|
|
||||||
fn parse_concrete_type<'a>(
|
fn parse_concrete_type<'a>(
|
||||||
arena: &'a Bump,
|
arena: &'a Bump,
|
||||||
state: State<'a>,
|
mut state: State<'a>,
|
||||||
) -> ParseResult<'a, TypeAnnotation<'a>> {
|
) -> ParseResult<'a, TypeAnnotation<'a>> {
|
||||||
let mut part_buf = String::new_in(arena); // The current "part" (parts are dot-separated.)
|
let mut part_buf = String::new_in(arena); // The current "part" (parts are dot-separated.)
|
||||||
let mut parts: Vec<&'a str> = Vec::new_in(arena);
|
let mut parts: Vec<&'a str> = Vec::new_in(arena);
|
||||||
|
|
||||||
// Qualified types must start with a capitalized letter.
|
// Qualified types must start with a capitalized letter.
|
||||||
let (first_letter, mut state) = utf8_char().parse(arena, state)?;
|
match peek_utf8_char(&state) {
|
||||||
|
Ok((first_letter, bytes_parsed)) => {
|
||||||
|
if first_letter.is_alphabetic() && first_letter.is_uppercase() {
|
||||||
|
part_buf.push(first_letter);
|
||||||
|
} else {
|
||||||
|
return Err(unexpected(0, state, Attempting::ConcreteType));
|
||||||
|
}
|
||||||
|
|
||||||
if first_letter.is_alphabetic() && first_letter.is_uppercase() {
|
state = state.advance_without_indenting(bytes_parsed)?;
|
||||||
part_buf.push(first_letter);
|
}
|
||||||
} else {
|
Err(reason) => return state.fail(reason),
|
||||||
return Err(unexpected(0, state, Attempting::ConcreteType));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut next_char = None;
|
let mut next_char = None;
|
||||||
|
|
||||||
while !state.bytes.is_empty() {
|
while !state.bytes.is_empty() {
|
||||||
let (ch, new_state) = utf8_char().parse(arena, state)?;
|
match peek_utf8_char(&state) {
|
||||||
|
Ok((ch, bytes_parsed)) => {
|
||||||
|
// After the first character, only these are allowed:
|
||||||
|
//
|
||||||
|
// * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
|
||||||
|
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
|
||||||
|
// * A dot ('.')
|
||||||
|
if ch.is_alphabetic() {
|
||||||
|
if part_buf.is_empty() && !ch.is_uppercase() {
|
||||||
|
// Each part must begin with a capital letter.
|
||||||
|
return malformed(Some(ch), arena, state, parts);
|
||||||
|
}
|
||||||
|
|
||||||
state = new_state;
|
part_buf.push(ch);
|
||||||
|
} else if ch.is_ascii_digit() {
|
||||||
|
// Parts may not start with numbers!
|
||||||
|
if part_buf.is_empty() {
|
||||||
|
return malformed(Some(ch), arena, state, parts);
|
||||||
|
}
|
||||||
|
|
||||||
// After the first character, only these are allowed:
|
part_buf.push(ch);
|
||||||
//
|
} else if ch == '.' {
|
||||||
// * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
|
// Having two consecutive dots is an error.
|
||||||
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
|
if part_buf.is_empty() {
|
||||||
// * A dot ('.')
|
return malformed(Some(ch), arena, state, parts);
|
||||||
if ch.is_alphabetic() {
|
}
|
||||||
if part_buf.is_empty() && !ch.is_uppercase() {
|
|
||||||
// Each part must begin with a capital letter.
|
parts.push(part_buf.into_bump_str());
|
||||||
return malformed(Some(ch), arena, state, parts);
|
|
||||||
|
// Now that we've recorded the contents of the current buffer, reset it.
|
||||||
|
part_buf = String::new_in(arena);
|
||||||
|
} else {
|
||||||
|
// This must be the end of the type. We're done!
|
||||||
|
next_char = Some(ch);
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
state = state.advance_without_indenting(bytes_parsed)?;
|
||||||
}
|
}
|
||||||
|
Err(reason) => return state.fail(reason),
|
||||||
part_buf.push(ch);
|
|
||||||
} else if ch.is_ascii_digit() {
|
|
||||||
// Parts may not start with numbers!
|
|
||||||
if part_buf.is_empty() {
|
|
||||||
return malformed(Some(ch), arena, state, parts);
|
|
||||||
}
|
|
||||||
|
|
||||||
part_buf.push(ch);
|
|
||||||
} else if ch == '.' {
|
|
||||||
// Having two consecutive dots is an error.
|
|
||||||
if part_buf.is_empty() {
|
|
||||||
return malformed(Some(ch), arena, state, parts);
|
|
||||||
}
|
|
||||||
|
|
||||||
parts.push(part_buf.into_bump_str());
|
|
||||||
|
|
||||||
// Now that we've recorded the contents of the current buffer, reset it.
|
|
||||||
part_buf = String::new_in(arena);
|
|
||||||
} else {
|
|
||||||
// This must be the end of the type. We're done!
|
|
||||||
next_char = Some(ch);
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -349,31 +357,41 @@ fn parse_concrete_type<'a>(
|
||||||
|
|
||||||
fn parse_type_variable<'a>(
|
fn parse_type_variable<'a>(
|
||||||
arena: &'a Bump,
|
arena: &'a Bump,
|
||||||
state: State<'a>,
|
mut state: State<'a>,
|
||||||
) -> ParseResult<'a, TypeAnnotation<'a>> {
|
) -> ParseResult<'a, TypeAnnotation<'a>> {
|
||||||
let mut buf = String::new_in(arena);
|
let mut buf = String::new_in(arena);
|
||||||
let (first_letter, mut state) = utf8_char().parse(arena, state)?;
|
|
||||||
|
|
||||||
// Type variables must start with a lowercase letter.
|
match peek_utf8_char(&state) {
|
||||||
if first_letter.is_alphabetic() && first_letter.is_lowercase() {
|
Ok((first_letter, bytes_parsed)) => {
|
||||||
buf.push(first_letter);
|
// Type variables must start with a lowercase letter.
|
||||||
} else {
|
if first_letter.is_alphabetic() && first_letter.is_lowercase() {
|
||||||
return Err(unexpected(0, state, Attempting::TypeVariable));
|
buf.push(first_letter);
|
||||||
|
} else {
|
||||||
|
return Err(unexpected(0, state, Attempting::TypeVariable));
|
||||||
|
}
|
||||||
|
|
||||||
|
state = state.advance_without_indenting(bytes_parsed)?;
|
||||||
|
}
|
||||||
|
Err(reason) => return state.fail(reason),
|
||||||
}
|
}
|
||||||
|
|
||||||
while !state.bytes.is_empty() {
|
while !state.bytes.is_empty() {
|
||||||
let (ch, new_state) = utf8_char().parse(arena, state)?;
|
match peek_utf8_char(&state) {
|
||||||
|
Ok((ch, bytes_parsed)) => {
|
||||||
|
// After the first character, only these are allowed:
|
||||||
|
//
|
||||||
|
// * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
|
||||||
|
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
|
||||||
|
if ch.is_alphabetic() || ch.is_ascii_digit() {
|
||||||
|
buf.push(ch);
|
||||||
|
} else {
|
||||||
|
// This must be the end of the type. We're done!
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
state = new_state;
|
state = state.advance_without_indenting(bytes_parsed)?;
|
||||||
// After the first character, only these are allowed:
|
}
|
||||||
//
|
Err(reason) => return state.fail(reason),
|
||||||
// * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
|
|
||||||
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
|
|
||||||
if ch.is_alphabetic() || ch.is_ascii_digit() {
|
|
||||||
buf.push(ch);
|
|
||||||
} else {
|
|
||||||
// This must be the end of the type. We're done!
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -399,22 +417,24 @@ fn malformed<'a>(
|
||||||
|
|
||||||
// Consume the remaining chars in the identifier.
|
// Consume the remaining chars in the identifier.
|
||||||
while !state.bytes.is_empty() {
|
while !state.bytes.is_empty() {
|
||||||
let (ch, new_state) = utf8_char().parse(arena, state)?;
|
match peek_utf8_char(&state) {
|
||||||
|
Ok((ch, bytes_parsed)) => {
|
||||||
|
// We can't use ch.is_alphanumeric() here because that passes for
|
||||||
|
// things that are "numeric" but not ASCII digits, like `¾`
|
||||||
|
if ch == '.' || ch.is_alphabetic() || ch.is_ascii_digit() {
|
||||||
|
full_string.push(ch);
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
state = new_state;
|
state = state.advance_without_indenting(bytes_parsed)?;
|
||||||
// We can't use ch.is_alphanumeric() here because that passes for
|
}
|
||||||
// things that are "numeric" but not ASCII digits, like `¾`
|
Err(reason) => return state.fail(reason),
|
||||||
if ch == '.' || ch.is_alphabetic() || ch.is_ascii_digit() {
|
|
||||||
full_string.push(ch);
|
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let chars_parsed = full_string.len();
|
|
||||||
|
|
||||||
Ok((
|
Ok((
|
||||||
TypeAnnotation::Malformed(full_string.into_bump_str()),
|
TypeAnnotation::Malformed(full_string.into_bump_str()),
|
||||||
state.advance_without_indenting(chars_parsed)?,
|
state,
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue