Fix remaining UTF-8 parsing issues

This commit is contained in:
Richard Feldman 2020-07-26 21:38:29 -04:00
parent eaaeda728a
commit 273528db77
5 changed files with 358 additions and 267 deletions

View file

@ -1,7 +1,7 @@
use crate::ast::CommentOrNewline::{self, *}; use crate::ast::CommentOrNewline::{self, *};
use crate::ast::Spaceable; use crate::ast::Spaceable;
use crate::parser::{ use crate::parser::{
self, and, peek_utf8_char, unexpected, unexpected_eof, Fail, FailReason, Parser, State, self, and, peek_utf8_char, unexpected, unexpected_eof, FailReason, Parser, State,
}; };
use bumpalo::collections::string::String; use bumpalo::collections::string::String;
use bumpalo::collections::vec::Vec; use bumpalo::collections::vec::Vec;
@ -219,7 +219,7 @@ fn spaces<'a>(
move |arena: &'a Bump, state: State<'a>| { move |arena: &'a Bump, state: State<'a>| {
let original_state = state.clone(); let original_state = state.clone();
let mut space_list = Vec::new_in(arena); let mut space_list = Vec::new_in(arena);
let mut chars_parsed = 0; let mut bytes_parsed = 0;
let mut comment_line_buf = String::new_in(arena); let mut comment_line_buf = String::new_in(arena);
let mut line_state = LineState::Normal; let mut line_state = LineState::Normal;
let mut state = state; let mut state = state;
@ -227,8 +227,8 @@ fn spaces<'a>(
while !state.bytes.is_empty() { while !state.bytes.is_empty() {
match peek_utf8_char(&state) { match peek_utf8_char(&state) {
Ok(ch) => { Ok((ch, utf8_len)) => {
chars_parsed += 1; bytes_parsed += utf8_len;
match line_state { match line_state {
LineState::Normal => { LineState::Normal => {
@ -263,7 +263,7 @@ fn spaces<'a>(
line_state = LineState::Comment; line_state = LineState::Comment;
} }
_ => { _ => {
return if require_at_least_one && chars_parsed <= 1 { return if require_at_least_one && bytes_parsed <= 1 {
// We've parsed 1 char and it was not a space, // We've parsed 1 char and it was not a space,
// but we require parsing at least one space! // but we require parsing at least one space!
Err(unexpected(0, state.clone(), state.attempting)) Err(unexpected(0, state.clone(), state.attempting))
@ -349,8 +349,7 @@ fn spaces<'a>(
line_state = LineState::Normal; line_state = LineState::Normal;
} }
nonblank => { nonblank => {
// Chars can have btye lengths of more than 1! state = state.advance_without_indenting(utf8_len)?;
state = state.advance_without_indenting(nonblank.len_utf8())?;
comment_line_buf.push(nonblank); comment_line_buf.push(nonblank);
} }
@ -358,21 +357,12 @@ fn spaces<'a>(
} }
} }
} }
Err(Fail { Err(FailReason::BadUtf8) => {
reason: FailReason::BadUtf8,
attempting,
}) => {
// If we hit an invalid UTF-8 character, bail out immediately. // If we hit an invalid UTF-8 character, bail out immediately.
return Err(( return state.fail(FailReason::BadUtf8);
Fail {
reason: dbg!(FailReason::BadUtf8),
attempting,
},
state,
));
} }
Err(_) => { Err(_) => {
if require_at_least_one && chars_parsed == 0 { if require_at_least_one && bytes_parsed == 0 {
return Err(unexpected_eof(0, state.attempting, state)); return Err(unexpected_eof(0, state.attempting, state));
} else { } else {
let space_slice = space_list.into_bump_slice(); let space_slice = space_list.into_bump_slice();

View file

@ -1,6 +1,6 @@
use crate::ast::Attempting; use crate::ast::Attempting;
use crate::keyword; use crate::keyword;
use crate::parser::{unexpected, utf8_char, Fail, FailReason, ParseResult, Parser, State}; use crate::parser::{peek_utf8_char, unexpected, Fail, FailReason, ParseResult, Parser, State};
use bumpalo::collections::string::String; use bumpalo::collections::string::String;
use bumpalo::collections::vec::Vec; use bumpalo::collections::vec::Vec;
use bumpalo::Bump; use bumpalo::Bump;
@ -69,7 +69,7 @@ impl<'a> Ident<'a> {
#[inline(always)] #[inline(always)]
pub fn parse_ident<'a>( pub fn parse_ident<'a>(
arena: &'a Bump, arena: &'a Bump,
state: State<'a>, mut state: State<'a>,
) -> ParseResult<'a, (Ident<'a>, Option<char>)> { ) -> ParseResult<'a, (Ident<'a>, Option<char>)> {
let mut part_buf = String::new_in(arena); // The current "part" (parts are dot-separated.) let mut part_buf = String::new_in(arena); // The current "part" (parts are dot-separated.)
let mut capitalized_parts: Vec<&'a str> = Vec::new_in(arena); let mut capitalized_parts: Vec<&'a str> = Vec::new_in(arena);
@ -80,93 +80,112 @@ pub fn parse_ident<'a>(
// Identifiers and accessor functions must start with either a letter or a dot. // Identifiers and accessor functions must start with either a letter or a dot.
// If this starts with neither, it must be something else! // If this starts with neither, it must be something else!
let (first_ch, mut state) = utf8_char().parse(arena, state)?; match peek_utf8_char(&state) {
Ok((first_ch, bytes_parsed)) => {
if first_ch.is_alphabetic() {
part_buf.push(first_ch);
if first_ch.is_alphabetic() { is_capitalized = first_ch.is_uppercase();
part_buf.push(first_ch); is_accessor_fn = false;
is_capitalized = first_ch.is_uppercase(); state = state.advance_without_indenting(bytes_parsed)?;
is_accessor_fn = false; } else if first_ch == '.' {
} else if first_ch == '.' { is_capitalized = false;
is_capitalized = false; is_accessor_fn = true;
is_accessor_fn = true;
} else if first_ch == '@' {
// '@' must always be followed by a capital letter!
let (next_ch, new_state) = utf8_char().parse(arena, state)?;
state = new_state; state = state.advance_without_indenting(bytes_parsed)?;
} else if first_ch == '@' {
state = state.advance_without_indenting(bytes_parsed)?;
if next_ch.is_uppercase() { // '@' must always be followed by a capital letter!
part_buf.push('@'); match peek_utf8_char(&state) {
part_buf.push(next_ch); Ok((next_ch, next_bytes_parsed)) => {
if next_ch.is_uppercase() {
state = state.advance_without_indenting(next_bytes_parsed)?;
is_private_tag = true; part_buf.push('@');
is_capitalized = true; part_buf.push(next_ch);
is_accessor_fn = false;
} else { is_private_tag = true;
return Err(unexpected(0, state, Attempting::Identifier)); is_capitalized = true;
is_accessor_fn = false;
} else {
return Err(unexpected(
bytes_parsed + next_bytes_parsed,
state,
Attempting::Identifier,
));
}
}
Err(reason) => return state.fail(reason),
}
} else {
return Err(unexpected(0, state, Attempting::Identifier));
}
} }
} else { Err(reason) => return state.fail(reason),
return Err(unexpected(0, state, Attempting::Identifier));
} }
while !state.bytes.is_empty() { while !state.bytes.is_empty() {
let (ch, new_state) = utf8_char().parse(arena, state)?; match peek_utf8_char(&state) {
Ok((ch, bytes_parsed)) => {
// After the first character, only these are allowed:
//
// * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
// * A dot ('.')
if ch.is_alphabetic() {
if part_buf.is_empty() {
// Capitalization is determined by the first character in the part.
is_capitalized = ch.is_uppercase();
}
state = new_state; part_buf.push(ch);
} else if ch.is_ascii_digit() {
// Parts may not start with numbers!
if part_buf.is_empty() {
return malformed(
Some(ch),
arena,
state,
capitalized_parts,
noncapitalized_parts,
);
}
// After the first character, only these are allowed: part_buf.push(ch);
// } else if ch == '.' {
// * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers // There are two posssible errors here:
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric() //
// * A dot ('.') // 1. Having two consecutive dots is an error.
if ch.is_alphabetic() { // 2. Having capitalized parts after noncapitalized (e.g. `foo.Bar`) is an error.
if part_buf.is_empty() { if part_buf.is_empty() || (is_capitalized && !noncapitalized_parts.is_empty()) {
// Capitalization is determined by the first character in the part. return malformed(
is_capitalized = ch.is_uppercase(); Some(ch),
arena,
state,
capitalized_parts,
noncapitalized_parts,
);
}
if is_capitalized {
capitalized_parts.push(part_buf.into_bump_str());
} else {
noncapitalized_parts.push(part_buf.into_bump_str());
}
// Now that we've recorded the contents of the current buffer, reset it.
part_buf = String::new_in(arena);
} else {
// This must be the end of the identifier. We're done!
break;
}
state = state.advance_without_indenting(bytes_parsed)?;
} }
Err(reason) => return state.fail(reason),
part_buf.push(ch);
} else if ch.is_ascii_digit() {
// Parts may not start with numbers!
if part_buf.is_empty() {
return malformed(
Some(ch),
arena,
state,
capitalized_parts,
noncapitalized_parts,
);
}
part_buf.push(ch);
} else if ch == '.' {
// There are two possible errors here:
//
// 1. Having two consecutive dots is an error.
// 2. Having capitalized parts after noncapitalized (e.g. `foo.Bar`) is an error.
if part_buf.is_empty() || (is_capitalized && !noncapitalized_parts.is_empty()) {
return malformed(
Some(ch),
arena,
state,
capitalized_parts,
noncapitalized_parts,
);
}
if is_capitalized {
capitalized_parts.push(part_buf.into_bump_str());
} else {
noncapitalized_parts.push(part_buf.into_bump_str());
}
// Now that we've recorded the contents of the current buffer, reset it.
part_buf = String::new_in(arena);
} else {
// This must be the end of the identifier. We're done!
break;
} }
} }
@ -262,26 +281,27 @@ fn malformed<'a>(
let mut next_char = None; let mut next_char = None;
while !state.bytes.is_empty() { while !state.bytes.is_empty() {
let (ch, new_state) = utf8_char().parse(arena, state)?; match peek_utf8_char(&state) {
Ok((ch, bytes_parsed)) => {
// We can't use ch.is_alphanumeric() here because that passes for
// things that are "numeric" but not ASCII digits, like `¾`
if ch == '.' || ch.is_alphabetic() || ch.is_ascii_digit() {
full_string.push(ch);
} else {
next_char = Some(ch);
state = new_state; break;
}
// We can't use ch.is_alphanumeric() here because that passes for state = state.advance_without_indenting(bytes_parsed)?;
// things that are "numeric" but not ASCII digits, like `¾` }
if ch == '.' || ch.is_alphabetic() || ch.is_ascii_digit() { Err(reason) => return state.fail(reason),
full_string.push(ch);
} else {
next_char = Some(ch);
break;
} }
} }
let chars_parsed = full_string.len();
Ok(( Ok((
(Ident::Malformed(full_string.into_bump_str()), next_char), (Ident::Malformed(full_string.into_bump_str()), next_char),
state.advance_without_indenting(chars_parsed)?, state,
)) ))
} }
@ -298,42 +318,47 @@ pub fn global_tag_or_ident<'a, F>(pred: F) -> impl Parser<'a, &'a str>
where where
F: Fn(char) -> bool, F: Fn(char) -> bool,
{ {
move |arena, state: State<'a>| { move |arena, mut state: State<'a>| {
// pred will determine if this is a tag or ident (based on capitalization) // pred will determine if this is a tag or ident (based on capitalization)
let (first_letter, mut state) = utf8_char().parse(arena, state)?; let (first_letter, bytes_parsed) = match peek_utf8_char(&state) {
Ok((first_letter, bytes_parsed)) => {
if !pred(first_letter) {
return Err(unexpected(0, state, Attempting::RecordFieldLabel));
}
if !pred(first_letter) { (first_letter, bytes_parsed)
return Err(unexpected(0, state, Attempting::RecordFieldLabel)); }
} Err(reason) => return state.fail(reason),
};
let mut buf = String::with_capacity_in(1, arena); let mut buf = String::with_capacity_in(1, arena);
buf.push(first_letter); buf.push(first_letter);
state = state.advance_without_indenting(bytes_parsed)?;
while !state.bytes.is_empty() { while !state.bytes.is_empty() {
let (ch, new_state) = utf8_char().parse(arena, state)?; match peek_utf8_char(&state) {
Ok((ch, bytes_parsed)) => {
// After the first character, only these are allowed:
//
// * Unicode alphabetic chars - you might include `鹏` if that's clear to your readers
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
// * A ':' indicating the end of the field
if ch.is_alphabetic() || ch.is_ascii_digit() {
buf.push(ch);
state = new_state; state = state.advance_without_indenting(bytes_parsed)?;
} else {
// After the first character, only these are allowed: // This is the end of the field. We're done!
// break;
// * Unicode alphabetic chars - you might include `鹏` if that's clear to your readers }
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric() }
// * A ':' indicating the end of the field Err(reason) => return state.fail(reason),
if ch.is_alphabetic() || ch.is_ascii_digit() { };
buf.push(ch);
} else {
// This is the end of the field. We're done!
break;
}
} }
let chars_parsed = buf.len(); Ok((buf.into_bump_str(), state))
Ok((
buf.into_bump_str(),
state.advance_without_indenting(chars_parsed)?,
))
} }
} }

View file

@ -7,7 +7,8 @@ use crate::expr::def;
use crate::header::ModuleName; use crate::header::ModuleName;
use crate::ident::unqualified_ident; use crate::ident::unqualified_ident;
use crate::parser::{ use crate::parser::{
self, ascii_char, ascii_string, loc, optional, unexpected, utf8_char, Parser, State, self, ascii_char, ascii_string, loc, optional, peek_utf8_char, peek_utf8_char_at, unexpected,
Parser, State,
}; };
use bumpalo::collections::{String, Vec}; use bumpalo::collections::{String, Vec};
use roc_region::all::Located; use roc_region::all::Located;
@ -61,57 +62,68 @@ pub fn interface_header<'a>() -> impl Parser<'a, InterfaceHeader<'a>> {
#[inline(always)] #[inline(always)]
pub fn module_name<'a>() -> impl Parser<'a, ModuleName<'a>> { pub fn module_name<'a>() -> impl Parser<'a, ModuleName<'a>> {
move |arena, state: State<'a>| { move |arena, mut state: State<'a>| {
let (first_letter, mut state) = utf8_char().parse(arena, state)?; match peek_utf8_char(&state) {
Ok((first_letter, bytes_parsed)) => {
if !first_letter.is_uppercase() {
return Err(unexpected(0, state, Attempting::Module));
};
if !first_letter.is_uppercase() { let mut buf = String::with_capacity_in(1, arena);
return Err(unexpected(0, state, Attempting::Module));
};
let mut buf = String::with_capacity_in(1, arena); buf.push(first_letter);
buf.push(first_letter); state = state.advance_without_indenting(bytes_parsed)?;
while !state.bytes.is_empty() { while !state.bytes.is_empty() {
let (ch, new_state) = utf8_char().parse(arena, state)?; match peek_utf8_char(&state) {
Ok((ch, bytes_parsed)) => {
// After the first character, only these are allowed:
//
// * Unicode alphabetic chars - you might include `鹏` if that's clear to your readers
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
// * A '.' separating module parts
if ch.is_alphabetic() || ch.is_ascii_digit() {
buf.push(ch);
state = new_state; state = state.advance_without_indenting(bytes_parsed)?;
} else if ch == '.' {
match peek_utf8_char_at(&state, 1) {
Ok((next, next_bytes_parsed)) => {
if next.is_uppercase() {
// If we hit another uppercase letter, keep going!
buf.push('.');
buf.push(next);
// After the first character, only these are allowed: state = state.advance_without_indenting(
// bytes_parsed + next_bytes_parsed,
// * Unicode alphabetic chars - you might include `鹏` if that's clear to your readers )?;
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric() } else {
// * A '.' separating module parts // We have finished parsing the module name.
if ch.is_alphabetic() || ch.is_ascii_digit() { //
buf.push(ch); // There may be an identifier after this '.',
} else if ch == '.' { // e.g. "baz" in `Foo.Bar.baz`
let (next, new_state) = utf8_char().parse(arena, state)?; return Ok((
ModuleName::new(buf.into_bump_str()),
state = new_state; state,
));
if next.is_uppercase() { }
// If we hit another uppercase letter, keep going! }
buf.push('.'); Err(reason) => return state.fail(reason),
buf.push(next); }
} else { } else {
let chars_parsed = buf.len(); // This is the end of the module name. We're done!
break;
// We have finished parsing the module name. }
// }
// There may be an identifier after this '.', Err(reason) => return state.fail(reason),
// e.g. "baz" in `Foo.Bar.baz` }
return Ok((
ModuleName::new(buf.into_bump_str()),
state.advance_without_indenting(chars_parsed)?,
));
} }
} else {
// This is the end of the module name. We're done!
break;
}
}
Ok((ModuleName::new(buf.into_bump_str()), state)) Ok((ModuleName::new(buf.into_bump_str()), state))
}
Err(reason) => state.fail(reason),
}
} }
} }

View file

@ -3,11 +3,12 @@ use bumpalo::collections::vec::Vec;
use bumpalo::Bump; use bumpalo::Bump;
use encode_unicode::CharExt; use encode_unicode::CharExt;
use roc_region::all::{Located, Region}; use roc_region::all::{Located, Region};
use std::fmt;
use std::str::from_utf8; use std::str::from_utf8;
use std::{char, mem, u16}; use std::{char, mem, u16};
/// A position in a source file. /// A position in a source file.
#[derive(Debug, Clone, PartialEq, Eq)] #[derive(Clone, PartialEq, Eq)]
pub struct State<'a> { pub struct State<'a> {
/// The raw input bytes from the file. /// The raw input bytes from the file.
pub bytes: &'a [u8], pub bytes: &'a [u8],
@ -101,7 +102,7 @@ impl<'a> State<'a> {
/// This assumes we are *not* advancing with spaces, or at least that /// This assumes we are *not* advancing with spaces, or at least that
/// any spaces on the line were preceded by non-spaces - which would mean /// any spaces on the line were preceded by non-spaces - which would mean
/// they weren't eligible to indent anyway. /// they weren't eligible to indent anyway.
pub fn advance_without_indenting(&self, quantity: usize) -> Result<Self, (Fail, Self)> { pub fn advance_without_indenting(self, quantity: usize) -> Result<Self, (Fail, Self)> {
match (self.column as usize).checked_add(quantity) { match (self.column as usize).checked_add(quantity) {
Some(column_usize) if column_usize <= u16::MAX as usize => { Some(column_usize) if column_usize <= u16::MAX as usize => {
Ok(State { Ok(State {
@ -184,6 +185,24 @@ impl<'a> State<'a> {
} }
} }
/// Hand-written `Debug` output for `State`.
///
/// The remaining input is printed as a string when it is valid UTF-8 and as
/// the raw byte slice otherwise; the other fields follow, one per line, each
/// prefixed with a newline and a tab.
impl<'a> fmt::Debug for State<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "State {{")?;

        // Show the input as text whenever it decodes cleanly; otherwise fall
        // back to the byte slice so the output is still usable.
        if let Ok(text) = from_utf8(self.bytes) {
            write!(f, "\n\tbytes: [utf8] {:?}", text)?;
        } else {
            write!(f, "\n\tbytes: [invalid utf8] {:?}", self.bytes)?;
        }

        // Position information.
        write!(f, "\n\t(line, col): ({}, {}),", self.line, self.column)?;
        write!(f, "\n\tindent_col: {}", self.indent_col)?;

        // Remaining bookkeeping fields.
        write!(f, "\n\tis_indenting: {:?}", self.is_indenting)?;
        write!(f, "\n\tattempting: {:?}", self.attempting)?;
        write!(f, "\n\toriginal_len: {}", self.original_len)?;

        write!(f, "\n}}")
    }
}
#[test] #[test]
fn state_size() { fn state_size() {
// State should always be under 8 machine words, so it fits in a typical // State should always be under 8 machine words, so it fits in a typical
@ -428,14 +447,14 @@ pub fn ascii_char<'a>(expected: char) -> impl Parser<'a, ()> {
/// A single UTF-8-encoded char. This will both parse *and* validate that the /// A single UTF-8-encoded char. This will both parse *and* validate that the
/// char is valid UTF-8. /// char is valid UTF-8.
pub fn utf8_char<'a>() -> impl Parser<'a, char> { pub fn utf8_char2<'a>() -> impl Parser<'a, char> {
move |_arena, state: State<'a>| { move |_arena, state: State<'a>| {
if !state.bytes.is_empty() { if !state.bytes.is_empty() {
match char::from_utf8_slice_start(state.bytes) { match char::from_utf8_slice_start(state.bytes) {
Ok((ch, bytes_parsed)) => { Ok((ch, bytes_parsed)) => {
return Ok((ch, state.advance_without_indenting(bytes_parsed)?)) return Ok((ch, state.advance_without_indenting(bytes_parsed)?))
} }
Err(_) => return state.fail(dbg!(FailReason::BadUtf8)), Err(_) => return state.fail(FailReason::BadUtf8),
} }
} else { } else {
Err(unexpected_eof(0, state.attempting, state)) Err(unexpected_eof(0, state.attempting, state))
@ -445,17 +464,40 @@ pub fn utf8_char<'a>() -> impl Parser<'a, char> {
/// A single UTF-8-encoded char. This will both parse *and* validate that the /// A single UTF-8-encoded char. This will both parse *and* validate that the
/// char is valid UTF-8, but it will *not* advance the state. /// char is valid UTF-8, but it will *not* advance the state.
pub fn peek_utf8_char<'a>(state: &State<'a>) -> Result<char, Fail> { pub fn peek_utf8_char<'a>(state: &State<'a>) -> Result<(char, usize), FailReason> {
match char::from_utf8_slice_start(state.bytes) { if !state.bytes.is_empty() {
Ok((ch, _)) => Ok(ch), match char::from_utf8_slice_start(state.bytes) {
Err(_) => Err(Fail { Ok((ch, len_utf8)) => Ok((ch, len_utf8)),
reason: dbg!(FailReason::BadUtf8), Err(_) => Err(FailReason::BadUtf8),
attempting: state.attempting, }
}), } else {
Err(FailReason::Eof(
Region::zero(), /* TODO get a better region */
))
} }
} }
/// A hardcoded string consisting only of ASCII characters. /// A single UTF-8-encoded char, with an offset. This will both parse *and*
/// validate that the char is valid UTF-8, but it will *not* advance the state.
///
/// `offset` is a byte index into `state.bytes`; decoding starts at that byte.
/// On success the decoded char is returned together with the length reported
/// by the decoder. Fails with `FailReason::Eof` when `offset` is at or past
/// the end of the input, and with `FailReason::BadUtf8` when the bytes at
/// `offset` do not decode.
pub fn peek_utf8_char_at<'a>(
    state: &State<'a>,
    offset: usize,
) -> Result<(char, usize), FailReason> {
    // Nothing left to decode at (or beyond) the end of the input.
    if offset >= state.bytes.len() {
        return Err(FailReason::Eof(
            Region::zero(), /* TODO get a better region */
        ));
    }

    // Any decoding failure is reported uniformly as bad UTF-8.
    char::from_utf8_slice_start(&state.bytes[offset..]).map_err(|_| FailReason::BadUtf8)
}
/// A hardcoded string with no newlines, consisting only of ASCII characters
pub fn ascii_string<'a>(keyword: &'static str) -> impl Parser<'a, ()> { pub fn ascii_string<'a>(keyword: &'static str) -> impl Parser<'a, ()> {
// Verify that this really is exclusively ASCII characters. // Verify that this really is exclusively ASCII characters.
// The `unsafe` block in this function relies upon this assumption! // The `unsafe` block in this function relies upon this assumption!
@ -472,10 +514,12 @@ pub fn ascii_string<'a>(keyword: &'static str) -> impl Parser<'a, ()> {
// SAFETY: Roc language keywords are statically known to contain only // SAFETY: Roc language keywords are statically known to contain only
// ASCII characters, which means their &str will be 100% u8 values in // ASCII characters, which means their &str will be 100% u8 values in
// memory, and thus can be safely interpreted as &[u8] // memory, and thus can be safely interpreted as &[u8]
Some(next_str) Some(next_str) => {
if next_str == unsafe { mem::transmute::<&'static str, &'a [u8]>(keyword) } => if next_str == unsafe { mem::transmute::<&'static str, &'a [u8]>(keyword) } {
{ Ok(((), state.advance_without_indenting(len)?))
Ok(((), state.advance_without_indenting(len)?)) } else {
Err(unexpected(len, state, Attempting::Keyword))
}
} }
_ => Err(unexpected_eof(0, Attempting::Keyword, state)), _ => Err(unexpected_eof(0, Attempting::Keyword, state)),
} }
@ -1126,6 +1170,6 @@ where
pub fn parse_utf8<'a>(bytes: &'a [u8]) -> Result<&'a str, FailReason> { pub fn parse_utf8<'a>(bytes: &'a [u8]) -> Result<&'a str, FailReason> {
match from_utf8(bytes) { match from_utf8(bytes) {
Ok(string) => Ok(string), Ok(string) => Ok(string),
Err(_) => Err(dbg!(FailReason::BadUtf8)), Err(_) => Err(FailReason::BadUtf8),
} }
} }

View file

@ -4,8 +4,8 @@ use crate::expr::{global_tag, private_tag};
use crate::ident::join_module_parts; use crate::ident::join_module_parts;
use crate::keyword; use crate::keyword;
use crate::parser::{ use crate::parser::{
allocated, ascii_char, ascii_string, not, optional, unexpected, utf8_char, Either, ParseResult, allocated, ascii_char, ascii_string, not, optional, peek_utf8_char, unexpected, Either,
Parser, State, ParseResult, Parser, State,
}; };
use bumpalo::collections::string::String; use bumpalo::collections::string::String;
use bumpalo::collections::vec::Vec; use bumpalo::collections::vec::Vec;
@ -263,61 +263,69 @@ fn expression<'a>(min_indent: u16) -> impl Parser<'a, Located<TypeAnnotation<'a>
fn parse_concrete_type<'a>( fn parse_concrete_type<'a>(
arena: &'a Bump, arena: &'a Bump,
state: State<'a>, mut state: State<'a>,
) -> ParseResult<'a, TypeAnnotation<'a>> { ) -> ParseResult<'a, TypeAnnotation<'a>> {
let mut part_buf = String::new_in(arena); // The current "part" (parts are dot-separated.) let mut part_buf = String::new_in(arena); // The current "part" (parts are dot-separated.)
let mut parts: Vec<&'a str> = Vec::new_in(arena); let mut parts: Vec<&'a str> = Vec::new_in(arena);
// Qualified types must start with a capitalized letter. // Qualified types must start with a capitalized letter.
let (first_letter, mut state) = utf8_char().parse(arena, state)?; match peek_utf8_char(&state) {
Ok((first_letter, bytes_parsed)) => {
if first_letter.is_alphabetic() && first_letter.is_uppercase() {
part_buf.push(first_letter);
} else {
return Err(unexpected(0, state, Attempting::ConcreteType));
}
if first_letter.is_alphabetic() && first_letter.is_uppercase() { state = state.advance_without_indenting(bytes_parsed)?;
part_buf.push(first_letter); }
} else { Err(reason) => return state.fail(reason),
return Err(unexpected(0, state, Attempting::ConcreteType));
} }
let mut next_char = None; let mut next_char = None;
while !state.bytes.is_empty() { while !state.bytes.is_empty() {
let (ch, new_state) = utf8_char().parse(arena, state)?; match peek_utf8_char(&state) {
Ok((ch, bytes_parsed)) => {
// After the first character, only these are allowed:
//
// * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
// * A dot ('.')
if ch.is_alphabetic() {
if part_buf.is_empty() && !ch.is_uppercase() {
// Each part must begin with a capital letter.
return malformed(Some(ch), arena, state, parts);
}
state = new_state; part_buf.push(ch);
} else if ch.is_ascii_digit() {
// Parts may not start with numbers!
if part_buf.is_empty() {
return malformed(Some(ch), arena, state, parts);
}
// After the first character, only these are allowed: part_buf.push(ch);
// } else if ch == '.' {
// * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers // Having two consecutive dots is an error.
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric() if part_buf.is_empty() {
// * A dot ('.') return malformed(Some(ch), arena, state, parts);
if ch.is_alphabetic() { }
if part_buf.is_empty() && !ch.is_uppercase() {
// Each part must begin with a capital letter. parts.push(part_buf.into_bump_str());
return malformed(Some(ch), arena, state, parts);
// Now that we've recorded the contents of the current buffer, reset it.
part_buf = String::new_in(arena);
} else {
// This must be the end of the type. We're done!
next_char = Some(ch);
break;
}
state = state.advance_without_indenting(bytes_parsed)?;
} }
Err(reason) => return state.fail(reason),
part_buf.push(ch);
} else if ch.is_ascii_digit() {
// Parts may not start with numbers!
if part_buf.is_empty() {
return malformed(Some(ch), arena, state, parts);
}
part_buf.push(ch);
} else if ch == '.' {
// Having two consecutive dots is an error.
if part_buf.is_empty() {
return malformed(Some(ch), arena, state, parts);
}
parts.push(part_buf.into_bump_str());
// Now that we've recorded the contents of the current buffer, reset it.
part_buf = String::new_in(arena);
} else {
// This must be the end of the type. We're done!
next_char = Some(ch);
break;
} }
} }
@ -349,31 +357,41 @@ fn parse_concrete_type<'a>(
fn parse_type_variable<'a>( fn parse_type_variable<'a>(
arena: &'a Bump, arena: &'a Bump,
state: State<'a>, mut state: State<'a>,
) -> ParseResult<'a, TypeAnnotation<'a>> { ) -> ParseResult<'a, TypeAnnotation<'a>> {
let mut buf = String::new_in(arena); let mut buf = String::new_in(arena);
let (first_letter, mut state) = utf8_char().parse(arena, state)?;
// Type variables must start with a lowercase letter. match peek_utf8_char(&state) {
if first_letter.is_alphabetic() && first_letter.is_lowercase() { Ok((first_letter, bytes_parsed)) => {
buf.push(first_letter); // Type variables must start with a lowercase letter.
} else { if first_letter.is_alphabetic() && first_letter.is_lowercase() {
return Err(unexpected(0, state, Attempting::TypeVariable)); buf.push(first_letter);
} else {
return Err(unexpected(0, state, Attempting::TypeVariable));
}
state = state.advance_without_indenting(bytes_parsed)?;
}
Err(reason) => return state.fail(reason),
} }
while !state.bytes.is_empty() { while !state.bytes.is_empty() {
let (ch, new_state) = utf8_char().parse(arena, state)?; match peek_utf8_char(&state) {
Ok((ch, bytes_parsed)) => {
// After the first character, only these are allowed:
//
// * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
if ch.is_alphabetic() || ch.is_ascii_digit() {
buf.push(ch);
} else {
// This must be the end of the type. We're done!
break;
}
state = new_state; state = state.advance_without_indenting(bytes_parsed)?;
// After the first character, only these are allowed: }
// Err(reason) => return state.fail(reason),
// * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
if ch.is_alphabetic() || ch.is_ascii_digit() {
buf.push(ch);
} else {
// This must be the end of the type. We're done!
break;
} }
} }
@ -399,22 +417,24 @@ fn malformed<'a>(
// Consume the remaining chars in the identifier. // Consume the remaining chars in the identifier.
while !state.bytes.is_empty() { while !state.bytes.is_empty() {
let (ch, new_state) = utf8_char().parse(arena, state)?; match peek_utf8_char(&state) {
Ok((ch, bytes_parsed)) => {
// We can't use ch.is_alphanumeric() here because that passes for
// things that are "numeric" but not ASCII digits, like `¾`
if ch == '.' || ch.is_alphabetic() || ch.is_ascii_digit() {
full_string.push(ch);
} else {
break;
}
state = new_state; state = state.advance_without_indenting(bytes_parsed)?;
// We can't use ch.is_alphanumeric() here because that passes for }
// things that are "numeric" but not ASCII digits, like `¾` Err(reason) => return state.fail(reason),
if ch == '.' || ch.is_alphabetic() || ch.is_ascii_digit() {
full_string.push(ch);
} else {
break;
} }
} }
let chars_parsed = full_string.len();
Ok(( Ok((
TypeAnnotation::Malformed(full_string.into_bump_str()), TypeAnnotation::Malformed(full_string.into_bump_str()),
state.advance_without_indenting(chars_parsed)?, state,
)) ))
} }