Lazily validate that input bytes are valid UTF-8

This commit is contained in:
Richard Feldman 2020-07-25 22:12:42 -04:00
parent 15f087c93e
commit 9f9ce327d4
21 changed files with 709 additions and 626 deletions

View file

@ -6,7 +6,9 @@ use crate::blankspace::{space0_around, space1};
use crate::expr::def;
use crate::header::ModuleName;
use crate::ident::unqualified_ident;
use crate::parser::{self, char, loc, optional, string, unexpected, unexpected_eof, Parser, State};
use crate::parser::{
self, ascii_char, ascii_string, loc, optional, unexpected, utf8_char, Parser, State,
};
use bumpalo::collections::{String, Vec};
use roc_region::all::Located;
@ -30,7 +32,10 @@ pub fn app_module<'a>() -> impl Parser<'a, Module<'a>> {
pub fn interface_header<'a>() -> impl Parser<'a, InterfaceHeader<'a>> {
parser::map(
and!(
skip_first!(string("interface"), and!(space1(1), loc!(module_name()))),
skip_first!(
ascii_string("interface"),
and!(space1(1), loc!(module_name()))
),
and!(exposes(), imports())
),
|(
@ -57,32 +62,21 @@ pub fn interface_header<'a>() -> impl Parser<'a, InterfaceHeader<'a>> {
#[inline(always)]
pub fn module_name<'a>() -> impl Parser<'a, ModuleName<'a>> {
move |arena, state: State<'a>| {
let mut chars = state.input.chars();
let (first_letter, mut state) = utf8_char().parse(arena, state)?;
let first_letter = match chars.next() {
Some(first_char) => {
// Module names must all be uppercase
if first_char.is_uppercase() {
first_char
} else {
return Err(unexpected(
first_char,
0,
state,
Attempting::RecordFieldLabel,
));
}
}
None => {
return Err(unexpected_eof(0, Attempting::Identifier, state));
}
if !first_letter.is_uppercase() {
return Err(unexpected(0, state, Attempting::Module));
};
let mut buf = String::with_capacity_in(1, arena);
buf.push(first_letter);
while let Some(ch) = chars.next() {
while !state.bytes.is_empty() {
let (ch, new_state) = utf8_char().parse(arena, state)?;
state = new_state;
// After the first character, only these are allowed:
//
// * Unicode alphabetic chars - you might include `鹏` if that's clear to your readers
@ -91,24 +85,25 @@ pub fn module_name<'a>() -> impl Parser<'a, ModuleName<'a>> {
if ch.is_alphabetic() || ch.is_ascii_digit() {
buf.push(ch);
} else if ch == '.' {
match chars.next() {
Some(next) => {
if next.is_uppercase() {
// If we hit another uppercase letter, keep going!
buf.push('.');
buf.push(next);
} else {
// We have finished parsing the module name.
//
// There may be an identifier after this '.',
// e.g. "baz" in `Foo.Bar.baz`
break;
}
}
None => {
// A module name can't end with a '.'
return Err(unexpected_eof(0, Attempting::Identifier, state));
}
let (next, new_state) = utf8_char().parse(arena, state)?;
state = new_state;
if next.is_uppercase() {
// If we hit another uppercase letter, keep going!
buf.push('.');
buf.push(next);
} else {
let chars_parsed = buf.len();
// We have finished parsing the module name.
//
// There may be an identifier after this '.',
// e.g. "baz" in `Foo.Bar.baz`
return Ok((
ModuleName::new(buf.into_bump_str()),
state.advance_without_indenting(chars_parsed)?,
));
}
} else {
// This is the end of the module name. We're done!
@ -116,12 +111,7 @@ pub fn module_name<'a>() -> impl Parser<'a, ModuleName<'a>> {
}
}
let chars_parsed = buf.len();
Ok((
ModuleName::new(buf.into_bump_str()),
state.advance_without_indenting(chars_parsed)?,
))
Ok((ModuleName::new(buf.into_bump_str()), state))
}
}
@ -129,7 +119,7 @@ pub fn module_name<'a>() -> impl Parser<'a, ModuleName<'a>> {
fn app_header<'a>() -> impl Parser<'a, AppHeader<'a>> {
parser::map(
and!(
skip_first!(string("app"), and!(space1(1), loc!(module_name()))),
skip_first!(ascii_string("app"), and!(space1(1), loc!(module_name()))),
and!(provides(), imports())
),
|(
@ -167,8 +157,14 @@ fn provides<'a>() -> impl Parser<
),
> {
and!(
and!(skip_second!(space1(1), string("provides")), space1(1)),
collection!(char('['), loc!(exposes_entry()), char(','), char(']'), 1)
and!(skip_second!(space1(1), ascii_string("provides")), space1(1)),
collection!(
ascii_char('['),
loc!(exposes_entry()),
ascii_char(','),
ascii_char(']'),
1
)
)
}
@ -181,8 +177,14 @@ fn exposes<'a>() -> impl Parser<
),
> {
and!(
and!(skip_second!(space1(1), string("exposes")), space1(1)),
collection!(char('['), loc!(exposes_entry()), char(','), char(']'), 1)
and!(skip_second!(space1(1), ascii_string("exposes")), space1(1)),
collection!(
ascii_char('['),
loc!(exposes_entry()),
ascii_char(','),
ascii_char(']'),
1
)
)
}
@ -195,8 +197,14 @@ fn imports<'a>() -> impl Parser<
),
> {
and!(
and!(skip_second!(space1(1), string("imports")), space1(1)),
collection!(char('['), loc!(imports_entry()), char(','), char(']'), 1)
and!(skip_second!(space1(1), ascii_string("imports")), space1(1)),
collection!(
ascii_char('['),
loc!(imports_entry()),
ascii_char(','),
ascii_char(']'),
1
)
)
}
@ -213,8 +221,14 @@ fn imports_entry<'a>() -> impl Parser<'a, ImportsEntry<'a>> {
module_name(),
// e.g. `.{ Task, after}`
optional(skip_first!(
char('.'),
collection!(char('{'), loc!(exposes_entry()), char(','), char('}'), 1)
ascii_char('.'),
collection!(
ascii_char('{'),
loc!(exposes_entry()),
ascii_char(','),
ascii_char('}'),
1
)
))
),
|arena,